From 92ce6db9ee7666a347fccf0f72ba3225b199d6d1 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 11:09:18 -0700 Subject: [PATCH 001/700] [NFC] Rename AttributeList::hasFnAttribute() -> hasFnAttr() This is more consistent with similar methods. --- clang/lib/CodeGen/CGCall.cpp | 2 +- llvm/include/llvm/IR/Attributes.h | 4 +- llvm/include/llvm/IR/Function.h | 4 +- llvm/include/llvm/IR/InstrTypes.h | 2 +- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 2 +- llvm/lib/CodeGen/MachineVerifier.cpp | 2 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 2 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- llvm/lib/IR/Attributes.cpp | 4 +- llvm/lib/IR/Instructions.cpp | 4 +- llvm/lib/IR/Verifier.cpp | 48 +++++++++---------- .../Target/AArch64/AArch64ISelLowering.cpp | 8 ++-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +- .../Target/X86/X86IndirectBranchTracking.cpp | 2 +- llvm/unittests/IR/AttributesTest.cpp | 4 +- 18 files changed, 52 insertions(+), 54 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 43be6755a0745..04ecfacf02c02 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5232,7 +5232,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, CannotThrow = true; } else { // Otherwise, nounwind call sites will never throw. 
- CannotThrow = Attrs.hasFnAttribute(llvm::Attribute::NoUnwind); + CannotThrow = Attrs.hasFnAttr(llvm::Attribute::NoUnwind); if (auto *FPtr = dyn_cast(CalleePtr)) if (FPtr->hasFnAttribute(llvm::Attribute::NoUnwind)) diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index d28a4b4de1bf7..019fe45094c98 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -654,11 +654,11 @@ class AttributeList { /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but /// may be faster. - bool hasFnAttribute(Attribute::AttrKind Kind) const; + bool hasFnAttr(Attribute::AttrKind Kind) const; /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but /// may be faster. - bool hasFnAttribute(StringRef Kind) const; + bool hasFnAttr(StringRef Kind) const; /// Return true if the specified attribute is set for at least one /// parameter or for the return value. If Index is not nullptr, the index diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index 6d98f53157d27..d1c8a231d45aa 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -353,12 +353,12 @@ class Function : public GlobalObject, public ilist_node { /// Return true if the function has the attribute. bool hasFnAttribute(Attribute::AttrKind Kind) const { - return AttributeSets.hasFnAttribute(Kind); + return AttributeSets.hasFnAttr(Kind); } /// Return true if the function has the attribute. bool hasFnAttribute(StringRef Kind) const { - return AttributeSets.hasFnAttribute(Kind); + return AttributeSets.hasFnAttr(Kind); } /// Return the attribute for the given attribute kind. 
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index ef2c279ed4552..aab51f113fe38 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -2258,7 +2258,7 @@ class CallBase : public Instruction { bool hasFnAttrOnCalledFunction(StringRef Kind) const; template bool hasFnAttrImpl(AttrKind Kind) const { - if (Attrs.hasFnAttribute(Kind)) + if (Attrs.hasFnAttr(Kind)) return true; // Operand bundles override attributes on the called function, but don't diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 0214f8901a46d..a85dd7553b086 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -479,7 +479,7 @@ static void computeFunctionSummary( F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(), // FIXME: refactor this to use the same code that inliner is using. // Don't try to import functions with noinline attribute. 
- F.getAttributes().hasFnAttribute(Attribute::NoInline), + F.getAttributes().hasFnAttr(Attribute::NoInline), F.hasFnAttribute(Attribute::AlwaysInline)}; std::vector ParamAccesses; if (auto *SSI = GetSSICallback(F)) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 7e3198af02cd6..2b980ecb0236a 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1392,7 +1392,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { AttributeList Attrs = Intrinsic::getAttributes(MF->getFunction().getContext(), static_cast(IntrID)); - bool DeclHasSideEffects = !Attrs.hasFnAttribute(Attribute::ReadNone); + bool DeclHasSideEffects = !Attrs.hasFnAttr(Attribute::ReadNone); if (NoSideEffects && DeclHasSideEffects) { report("G_INTRINSIC used with intrinsic that accesses memory", MI); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 074ddaf4848a3..0786dc395b409 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6419,7 +6419,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SDValue Op = getValue(I.getArgOperand(0)); SDNodeFlags Flags; Flags.setNoFPExcept( - !F.getAttributes().hasFnAttribute(llvm::Attribute::StrictFP)); + !F.getAttributes().hasFnAttr(llvm::Attribute::StrictFP)); // If ISD::ISNAN should be expanded, do it right now, because the expansion // can use illegal types. 
Making expansion early allows to legalize these diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index dd282a3beae83..cdb3aedf4d234 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4336,7 +4336,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // When division is cheap or optimizing for minimum size, // fall through to DIVREM creation by skipping this fold. - if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize)) { + if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) { if (N0.getOpcode() == ISD::UREM) { if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl)) return Folded; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index ea71472a47cad..59c0ced90fbb7 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1397,11 +1397,11 @@ bool AttributeList::hasAttributes(unsigned Index) const { return getAttributes(Index).hasAttributes(); } -bool AttributeList::hasFnAttribute(Attribute::AttrKind Kind) const { +bool AttributeList::hasFnAttr(Attribute::AttrKind Kind) const { return pImpl && pImpl->hasFnAttribute(Kind); } -bool AttributeList::hasFnAttribute(StringRef Kind) const { +bool AttributeList::hasFnAttr(StringRef Kind) const { return hasAttribute(AttributeList::FunctionIndex, Kind); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 298cf0598224b..652a87d929ab2 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -352,13 +352,13 @@ bool CallBase::paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const { bool CallBase::hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const { if (const Function *F = getCalledFunction()) - return F->getAttributes().hasFnAttribute(Kind); + return F->getAttributes().hasFnAttr(Kind); return false; } bool 
CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const { if (const Function *F = getCalledFunction()) - return F->getAttributes().hasFnAttribute(Kind); + return F->getAttributes().hasFnAttr(Kind); return false; } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 1a66755f4668f..eb4a3b0893099 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1824,7 +1824,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, void Verifier::checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr, const Value *V) { - if (Attrs.hasFnAttribute(Attr)) { + if (Attrs.hasFnAttr(Attr)) { StringRef S = Attrs.getAttribute(AttributeList::FunctionIndex, Attr) .getValueAsString(); unsigned N; @@ -1939,50 +1939,50 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, "' does not apply to functions!", V); - Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) && - Attrs.hasFnAttribute(Attribute::ReadOnly)), + Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) && + Attrs.hasFnAttr(Attribute::ReadOnly)), "Attributes 'readnone and readonly' are incompatible!", V); - Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) && - Attrs.hasFnAttribute(Attribute::WriteOnly)), + Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) && + Attrs.hasFnAttr(Attribute::WriteOnly)), "Attributes 'readnone and writeonly' are incompatible!", V); - Assert(!(Attrs.hasFnAttribute(Attribute::ReadOnly) && - Attrs.hasFnAttribute(Attribute::WriteOnly)), + Assert(!(Attrs.hasFnAttr(Attribute::ReadOnly) && + Attrs.hasFnAttr(Attribute::WriteOnly)), "Attributes 'readonly and writeonly' are incompatible!", V); - Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) && - Attrs.hasFnAttribute(Attribute::InaccessibleMemOrArgMemOnly)), + Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) && + Attrs.hasFnAttr(Attribute::InaccessibleMemOrArgMemOnly)), "Attributes 'readnone and inaccessiblemem_or_argmemonly' are " "incompatible!", V); - 
Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) && - Attrs.hasFnAttribute(Attribute::InaccessibleMemOnly)), + Assert(!(Attrs.hasFnAttr(Attribute::ReadNone) && + Attrs.hasFnAttr(Attribute::InaccessibleMemOnly)), "Attributes 'readnone and inaccessiblememonly' are incompatible!", V); - Assert(!(Attrs.hasFnAttribute(Attribute::NoInline) && - Attrs.hasFnAttribute(Attribute::AlwaysInline)), + Assert(!(Attrs.hasFnAttr(Attribute::NoInline) && + Attrs.hasFnAttr(Attribute::AlwaysInline)), "Attributes 'noinline and alwaysinline' are incompatible!", V); - if (Attrs.hasFnAttribute(Attribute::OptimizeNone)) { - Assert(Attrs.hasFnAttribute(Attribute::NoInline), + if (Attrs.hasFnAttr(Attribute::OptimizeNone)) { + Assert(Attrs.hasFnAttr(Attribute::NoInline), "Attribute 'optnone' requires 'noinline'!", V); - Assert(!Attrs.hasFnAttribute(Attribute::OptimizeForSize), + Assert(!Attrs.hasFnAttr(Attribute::OptimizeForSize), "Attributes 'optsize and optnone' are incompatible!", V); - Assert(!Attrs.hasFnAttribute(Attribute::MinSize), + Assert(!Attrs.hasFnAttr(Attribute::MinSize), "Attributes 'minsize and optnone' are incompatible!", V); } - if (Attrs.hasFnAttribute(Attribute::JumpTable)) { + if (Attrs.hasFnAttr(Attribute::JumpTable)) { const GlobalValue *GV = cast(V); Assert(GV->hasGlobalUnnamedAddr(), "Attribute 'jumptable' requires 'unnamed_addr'", V); } - if (Attrs.hasFnAttribute(Attribute::AllocSize)) { + if (Attrs.hasFnAttr(Attribute::AllocSize)) { std::pair> Args = Attrs.getAllocSizeArgs(AttributeList::FunctionIndex); @@ -2009,7 +2009,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, return; } - if (Attrs.hasFnAttribute(Attribute::VScaleRange)) { + if (Attrs.hasFnAttr(Attribute::VScaleRange)) { std::pair Args = Attrs.getVScaleRangeArgs(AttributeList::FunctionIndex); @@ -2017,7 +2017,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, CheckFailed("'vscale_range' minimum cannot be greater than maximum", V); } - if 
(Attrs.hasFnAttribute("frame-pointer")) { + if (Attrs.hasFnAttr("frame-pointer")) { StringRef FP = Attrs.getAttribute(AttributeList::FunctionIndex, "frame-pointer").getValueAsString(); if (FP != "all" && FP != "non-leaf" && FP != "none") @@ -2334,7 +2334,7 @@ void Verifier::visitFunction(const Function &F) { // On function declarations/definitions, we do not support the builtin // attribute. We do not check this in VerifyFunctionAttrs since that is // checking for Attributes that can/can not ever be on functions. - Assert(!Attrs.hasFnAttribute(Attribute::Builtin), + Assert(!Attrs.hasFnAttr(Attribute::Builtin), "Attribute 'builtin' can only be applied to a callsite.", &F); Assert(!Attrs.hasAttrSomewhere(Attribute::ElementType), @@ -3071,14 +3071,14 @@ void Verifier::visitCallBase(CallBase &Call) { Assert(Callee->getValueType() == FTy, "Intrinsic called with incompatible signature", Call); - if (Attrs.hasFnAttribute(Attribute::Speculatable)) { + if (Attrs.hasFnAttr(Attribute::Speculatable)) { // Don't allow speculatable on call sites, unless the underlying function // declaration is also speculatable. 
Assert(Callee && Callee->isSpeculatable(), "speculatable attribute may not apply to call sites", Call); } - if (Attrs.hasFnAttribute(Attribute::Preallocated)) { + if (Attrs.hasFnAttr(Attribute::Preallocated)) { Assert(Call.getCalledFunction()->getIntrinsicID() == Intrinsic::call_preallocated_arg, "preallocated as a call site attribute can only be on " diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 732310c58ec3c..baa3feda74d3a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12053,8 +12053,7 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, EVT AArch64TargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { - bool CanImplicitFloat = - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; // Only use AdvSIMD to implement memset of 32-byte and above. It would have @@ -12084,8 +12083,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType( LLT AArch64TargetLowering::getOptimalMemOpLLT( const MemOp &Op, const AttributeList &FuncAttributes) const { - bool CanImplicitFloat = - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat); bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; // Only use AdvSIMD to implement memset of 32-byte and above. 
It would have @@ -17851,7 +17849,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); + bool OptSize = Attr.hasFnAttr(Attribute::MinSize); return OptSize && !VT.isVector(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 41dcffdf2d459..0d21d77d60470 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1066,7 +1066,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, AMDGPU::lookupRsrcIntrinsic(IntrID)) { AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), (Intrinsic::ID)IntrID); - if (Attr.hasFnAttribute(Attribute::ReadNone)) + if (Attr.hasFnAttr(Attribute::ReadNone)) return false; SIMachineFunctionInfo *MFI = MF.getInfo(); @@ -1081,7 +1081,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } Info.flags = MachineMemOperand::MODereferenceable; - if (Attr.hasFnAttribute(Attribute::ReadOnly)) { + if (Attr.hasFnAttr(Attribute::ReadOnly)) { unsigned DMaskLanes = 4; if (RsrcIntr->IsImage) { @@ -1105,7 +1105,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // FIXME: What does alignment mean for an image? 
Info.opc = ISD::INTRINSIC_W_CHAIN; Info.flags |= MachineMemOperand::MOLoad; - } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { + } else if (Attr.hasFnAttr(Attribute::WriteOnly)) { Info.opc = ISD::INTRINSIC_VOID; Type *DataTy = CI.getArgOperand(0)->getType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9e7f40301d92e..715725aa093a4 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2289,7 +2289,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool PreferIndirect = false; // Determine whether this is a non-secure function call. - if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call")) + if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) isCmseNSCall = true; // Disable tail calls if they're not supported. @@ -18134,7 +18134,7 @@ EVT ARMTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... 
if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { bool Fast; if (Op.size() >= 16 && (Op.isAligned(Align(16)) || diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index b7d0f57471f0c..fdc413d08b77d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -703,7 +703,7 @@ void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = &*FI; - if (F->getAttributes().hasFnAttribute("nvptx-libcall-callee")) { + if (F->getAttributes().hasFnAttr("nvptx-libcall-callee")) { emitDeclaration(F, O); continue; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 12f6804b3d764..72e9a2b4b29a7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2365,7 +2365,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, /// preferred vector width. EVT X86TargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { - if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { + if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { if (Op.size() >= 16 && (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { // FIXME: Check if unaligned 64-byte accesses are slow. @@ -52761,7 +52761,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. 
- bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); + bool OptSize = Attr.hasFnAttr(Attribute::MinSize); return OptSize && !VT.isVector(); } diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 85410c54a4d2e..732b2b1a5ada6 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -92,7 +92,7 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) { if (!CalleeFn) return false; AttributeList Attrs = CalleeFn->getAttributes(); - return Attrs.hasFnAttribute(Attribute::ReturnsTwice); + return Attrs.hasFnAttr(Attribute::ReturnsTwice); } bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index 37f9f310edabc..4ba790058f8ce 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -64,12 +64,12 @@ TEST(Attributes, AddAttributes) { AttrBuilder B; B.addAttribute(Attribute::NoReturn); AL = AL.addAttributes(C, AttributeList::FunctionIndex, AttributeSet::get(C, B)); - EXPECT_TRUE(AL.hasFnAttribute(Attribute::NoReturn)); + EXPECT_TRUE(AL.hasFnAttr(Attribute::NoReturn)); B.clear(); B.addAttribute(Attribute::SExt); AL = AL.addAttributes(C, AttributeList::ReturnIndex, B); EXPECT_TRUE(AL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)); - EXPECT_TRUE(AL.hasFnAttribute(Attribute::NoReturn)); + EXPECT_TRUE(AL.hasFnAttr(Attribute::NoReturn)); } TEST(Attributes, RemoveAlign) { From 80ea2bb57450a65cc724565ecfc9971ad93a3f15 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 11:16:52 -0700 Subject: [PATCH 002/700] [NFC] Rename AttributeList::getParam/Ret/FnAttributes() -> get*Attributes() This is more consistent with similar methods. 
--- clang/lib/CodeGen/CodeGenModule.cpp | 8 +++---- .../include/llvm/Analysis/TargetLibraryInfo.h | 2 +- llvm/include/llvm/IR/Attributes.h | 6 ++--- llvm/lib/AsmParser/LLParser.cpp | 8 +++---- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 6 ++--- llvm/lib/IR/AsmWriter.cpp | 24 +++++++++---------- llvm/lib/IR/Attributes.cpp | 6 ++--- llvm/lib/IR/Function.cpp | 4 ++-- llvm/lib/IR/Verifier.cpp | 14 +++++------ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 3 +-- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 9 ++++--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 4 ++-- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 16 ++++++------- llvm/lib/Transforms/IPO/Attributor.cpp | 12 +++++----- .../IPO/DeadArgumentElimination.cpp | 22 ++++++++--------- llvm/lib/Transforms/IPO/IROutliner.cpp | 3 +-- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2 +- .../Transforms/IPO/ThinLTOBitcodeWriter.cpp | 6 ++--- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 6 ++--- .../InstCombine/InstCombineCalls.cpp | 18 +++++++------- .../Instrumentation/DataFlowSanitizer.cpp | 8 +++---- .../Instrumentation/MemorySanitizer.cpp | 2 +- .../Scalar/RewriteStatepointsForGC.cpp | 6 ++--- .../Transforms/Utils/AssumeBundleBuilder.cpp | 2 +- .../Transforms/Utils/CallPromotionUtils.cpp | 6 ++--- llvm/lib/Transforms/Utils/CloneFunction.cpp | 6 ++--- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 2 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 8 +++---- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 4 ++-- llvm/tools/bugpoint/CrashDebugger.cpp | 4 ++-- llvm/unittests/IR/IRBuilderTest.cpp | 8 +++---- 32 files changed, 117 insertions(+), 120 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 13520861fe9b6..fa8312d30ad2d 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4749,7 +4749,7 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old, } // Add any parameter 
attributes. - newArgAttrs.push_back(oldAttrs.getParamAttributes(argNo)); + newArgAttrs.push_back(oldAttrs.getParamAttrs(argNo)); argNo++; } if (dontTransform) @@ -4777,9 +4777,9 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old, if (!newCall->getType()->isVoidTy()) newCall->takeName(callSite); - newCall->setAttributes(llvm::AttributeList::get( - newFn->getContext(), oldAttrs.getFnAttributes(), - oldAttrs.getRetAttributes(), newArgAttrs)); + newCall->setAttributes( + llvm::AttributeList::get(newFn->getContext(), oldAttrs.getFnAttrs(), + oldAttrs.getRetAttrs(), newArgAttrs)); newCall->setCallingConv(callSite->getCallingConv()); // Finally, remove the old call, replacing any uses with the new one. diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 22bfeda0efd0d..c27e109e8687b 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -238,7 +238,7 @@ class TargetLibraryInfo { else { // Disable individual libc/libm calls in TargetLibraryInfo. LibFunc LF; - AttributeSet FnAttrs = (*F)->getAttributes().getFnAttributes(); + AttributeSet FnAttrs = (*F)->getAttributes().getFnAttrs(); for (const Attribute &Attr : FnAttrs) { if (!Attr.isStringAttribute()) continue; diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 019fe45094c98..3b1cead212c85 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -620,13 +620,13 @@ class AttributeList { /// The attributes for the argument or parameter at the given index are /// returned. - AttributeSet getParamAttributes(unsigned ArgNo) const; + AttributeSet getParamAttrs(unsigned ArgNo) const; /// The attributes for the ret value are returned. - AttributeSet getRetAttributes() const; + AttributeSet getRetAttrs() const; /// The function attributes are returned. 
- AttributeSet getFnAttributes() const; + AttributeSet getFnAttrs() const; /// Return true if the attribute exists at the given index. bool hasAttribute(unsigned Index, Attribute::AttrKind Kind) const; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 799cb03c8c8c5..156b46cc94534 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -140,7 +140,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { if (Function *Fn = dyn_cast(V)) { AttributeList AS = Fn->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttributes()); + AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeAttributes(Context, AttributeList::FunctionIndex); FnAttrs.merge(B); @@ -157,7 +157,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { Fn->setAttributes(AS); } else if (CallInst *CI = dyn_cast(V)) { AttributeList AS = CI->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttributes()); + AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeAttributes(Context, AttributeList::FunctionIndex); FnAttrs.merge(B); AS = AS.addAttributes(Context, AttributeList::FunctionIndex, @@ -165,7 +165,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { CI->setAttributes(AS); } else if (InvokeInst *II = dyn_cast(V)) { AttributeList AS = II->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttributes()); + AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeAttributes(Context, AttributeList::FunctionIndex); FnAttrs.merge(B); AS = AS.addAttributes(Context, AttributeList::FunctionIndex, @@ -173,7 +173,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { II->setAttributes(AS); } else if (CallBrInst *CBI = dyn_cast(V)) { AttributeList AS = CBI->getAttributes(); - AttrBuilder FnAttrs(AS.getFnAttributes()); + AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeAttributes(Context, AttributeList::FunctionIndex); FnAttrs.merge(B); AS = AS.addAttributes(Context, AttributeList::FunctionIndex, diff --git 
a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index d5e366c21f7de..bc967f60db896 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3270,7 +3270,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { } if (Record.size() > 12) { - auto AS = getAttributes(Record[12]).getFnAttributes(); + auto AS = getAttributes(Record[12]).getFnAttrs(); NewGV->setAttributes(AS); } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index f90774412d417..6ec3b4d891a03 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -44,11 +44,11 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { // Get the function's current attributes. auto Attrs = Fn.getAttributes(); - auto FnAttrs = Attrs.getFnAttributes(); - auto RetAttrs = Attrs.getRetAttributes(); + auto FnAttrs = Attrs.getFnAttrs(); + auto RetAttrs = Attrs.getRetAttrs(); SmallVector ArgAttrs; for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo) - ArgAttrs.emplace_back(Attrs.getParamAttributes(ArgNo)); + ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo)); #define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet; #include "llvm/Frontend/OpenMP/OMPKinds.def" diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 5842d400eb67b..b3c11133e8308 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -988,7 +988,7 @@ void SlotTracker::processModule() { // Add all the function attributes to the table. // FIXME: Add attributes of other objects? - AttributeSet FnAttrs = F.getAttributes().getFnAttributes(); + AttributeSet FnAttrs = F.getAttributes().getFnAttrs(); if (FnAttrs.hasAttributes()) CreateAttributeSetSlot(FnAttrs); } @@ -1029,7 +1029,7 @@ void SlotTracker::processFunction() { // target may not be linked into the optimizer. 
if (const auto *Call = dyn_cast(&I)) { // Add all the call attributes to the table. - AttributeSet Attrs = Call->getAttributes().getFnAttributes(); + AttributeSet Attrs = Call->getAttributes().getFnAttrs(); if (Attrs.hasAttributes()) CreateAttributeSetSlot(Attrs); } @@ -3683,7 +3683,7 @@ void AssemblyWriter::printFunction(const Function *F) { const AttributeList &Attrs = F->getAttributes(); if (Attrs.hasAttributes(AttributeList::FunctionIndex)) { - AttributeSet AS = Attrs.getFnAttributes(); + AttributeSet AS = Attrs.getFnAttrs(); std::string AttrStr; for (const Attribute &Attr : AS) { @@ -3737,7 +3737,7 @@ void AssemblyWriter::printFunction(const Function *F) { // Output type... TypePrinter.print(FT->getParamType(I), Out); - AttributeSet ArgAttrs = Attrs.getParamAttributes(I); + AttributeSet ArgAttrs = Attrs.getParamAttrs(I); if (ArgAttrs.hasAttributes()) { Out << ' '; writeAttributeSet(ArgAttrs); @@ -3749,7 +3749,7 @@ void AssemblyWriter::printFunction(const Function *F) { // Insert commas as we go... 
the first arg doesn't get a comma if (Arg.getArgNo() != 0) Out << ", "; - printArgument(&Arg, Attrs.getParamAttributes(Arg.getArgNo())); + printArgument(&Arg, Attrs.getParamAttrs(Arg.getArgNo())); } } @@ -3770,7 +3770,7 @@ void AssemblyWriter::printFunction(const Function *F) { Mod->getDataLayout().getProgramAddressSpace() != 0) Out << " addrspace(" << F->getAddressSpace() << ")"; if (Attrs.hasAttributes(AttributeList::FunctionIndex)) - Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes()); + Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttrs()); if (F->hasSection()) { Out << " section \""; printEscapedString(F->getSection(), Out); @@ -4144,7 +4144,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { for (unsigned op = 0, Eop = CI->getNumArgOperands(); op < Eop; ++op) { if (op > 0) Out << ", "; - writeParamOperand(CI->getArgOperand(op), PAL.getParamAttributes(op)); + writeParamOperand(CI->getArgOperand(op), PAL.getParamAttrs(op)); } // Emit an ellipsis if this is a musttail call in a vararg function. 
This @@ -4156,7 +4156,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ')'; if (PAL.hasAttributes(AttributeList::FunctionIndex)) - Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes()); + Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(CI); } else if (const InvokeInst *II = dyn_cast(&I)) { @@ -4189,12 +4189,12 @@ void AssemblyWriter::printInstruction(const Instruction &I) { for (unsigned op = 0, Eop = II->getNumArgOperands(); op < Eop; ++op) { if (op) Out << ", "; - writeParamOperand(II->getArgOperand(op), PAL.getParamAttributes(op)); + writeParamOperand(II->getArgOperand(op), PAL.getParamAttrs(op)); } Out << ')'; if (PAL.hasAttributes(AttributeList::FunctionIndex)) - Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes()); + Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(II); @@ -4229,12 +4229,12 @@ void AssemblyWriter::printInstruction(const Instruction &I) { for (unsigned op = 0, Eop = CBI->getNumArgOperands(); op < Eop; ++op) { if (op) Out << ", "; - writeParamOperand(CBI->getArgOperand(op), PAL.getParamAttributes(op)); + writeParamOperand(CBI->getArgOperand(op), PAL.getParamAttrs(op)); } Out << ')'; if (PAL.hasAttributes(AttributeList::FunctionIndex)) - Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes()); + Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(CBI); diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 59c0ced90fbb7..9bdaa56bb1cd7 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1372,15 +1372,15 @@ AttributeList AttributeList::addVScaleRangeAttr(LLVMContext &C, unsigned Index, // AttributeList Accessor Methods //===----------------------------------------------------------------------===// -AttributeSet AttributeList::getParamAttributes(unsigned ArgNo) const { +AttributeSet AttributeList::getParamAttrs(unsigned ArgNo) const { 
return getAttributes(ArgNo + FirstArgIndex); } -AttributeSet AttributeList::getRetAttributes() const { +AttributeSet AttributeList::getRetAttrs() const { return getAttributes(ReturnIndex); } -AttributeSet AttributeList::getFnAttributes() const { +AttributeSet AttributeList::getFnAttrs() const { return getAttributes(FunctionIndex); } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 2276c40ab6f09..2755e5356c333 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -177,7 +177,7 @@ static Type *getMemoryParamAllocType(AttributeSet ParamAttrs, Type *ArgTy) { uint64_t Argument::getPassPointeeByValueCopySize(const DataLayout &DL) const { AttributeSet ParamAttrs = - getParent()->getAttributes().getParamAttributes(getArgNo()); + getParent()->getAttributes().getParamAttrs(getArgNo()); if (Type *MemTy = getMemoryParamAllocType(ParamAttrs, getType())) return DL.getTypeAllocSize(MemTy); return 0; @@ -185,7 +185,7 @@ uint64_t Argument::getPassPointeeByValueCopySize(const DataLayout &DL) const { Type *Argument::getPointeeInMemoryValueType() const { AttributeSet ParamAttrs = - getParent()->getAttributes().getParamAttributes(getArgNo()); + getParent()->getAttributes().getParamAttrs(getArgNo()); return getMemoryParamAllocType(ParamAttrs, getType()); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index eb4a3b0893099..5fb1b99792791 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1861,7 +1861,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, bool SawSwiftError = false; // Verify return value attributes. - AttributeSet RetAttrs = Attrs.getRetAttributes(); + AttributeSet RetAttrs = Attrs.getRetAttrs(); for (Attribute RetAttr : RetAttrs) Assert(RetAttr.isStringAttribute() || Attribute::canUseAsRetAttr(RetAttr.getKindAsEnum()), @@ -1874,7 +1874,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, // Verify parameter attributes. 
for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) { Type *Ty = FT->getParamType(i); - AttributeSet ArgAttrs = Attrs.getParamAttributes(i); + AttributeSet ArgAttrs = Attrs.getParamAttrs(i); if (!IsIntrinsic) { Assert(!ArgAttrs.hasAttribute(Attribute::ImmArg), @@ -1931,8 +1931,8 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, if (!Attrs.hasAttributes(AttributeList::FunctionIndex)) return; - verifyAttributeTypes(Attrs.getFnAttributes(), V); - for (Attribute FnAttr : Attrs.getFnAttributes()) + verifyAttributeTypes(Attrs.getFnAttrs(), V); + for (Attribute FnAttr : Attrs.getFnAttrs()) Assert(FnAttr.isStringAttribute() || Attribute::canUseAsFnAttr(FnAttr.getKindAsEnum()), "Attribute '" + FnAttr.getAsString() + @@ -2168,7 +2168,7 @@ void Verifier::verifyStatepoint(const CallBase &Call) { Call); if (TargetFuncType->isVarArg()) { - AttributeSet ArgAttrs = Attrs.getParamAttributes(5 + i); + AttributeSet ArgAttrs = Attrs.getParamAttrs(5 + i); Assert(!ArgAttrs.hasAttribute(Attribute::StructRet), "Attribute 'sret' cannot be used for vararg call arguments!", Call); @@ -3159,7 +3159,7 @@ void Verifier::visitCallBase(CallBase &Call) { // Check attributes on the varargs part. 
for (unsigned Idx = FTy->getNumParams(); Idx < Call.arg_size(); ++Idx) { Type *Ty = Call.getArgOperand(Idx)->getType(); - AttributeSet ArgAttrs = Attrs.getParamAttributes(Idx); + AttributeSet ArgAttrs = Attrs.getParamAttrs(Idx); verifyParameterAttrs(ArgAttrs, Ty, &Call); if (ArgAttrs.hasAttribute(Attribute::Nest)) { @@ -3323,7 +3323,7 @@ static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) { Attribute::ByRef}; AttrBuilder Copy; for (auto AK : ABIAttrs) { - Attribute Attr = Attrs.getParamAttributes(I).getAttribute(AK); + Attribute Attr = Attrs.getParamAttrs(I).getAttribute(AK); if (Attr.isValid()) Copy.addAttribute(Attr); } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 9dd35d5f44d1a..e9804516006ee 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -5241,8 +5241,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, return false; const IntegerType *IntTy = dyn_cast(CalleeFn->getReturnType()); - const AttributeSet &Attrs = - CalleeFn->getAttributes().getRetAttributes(); + const AttributeSet &Attrs = CalleeFn->getAttributes().getRetAttrs(); if (IntTy && IntTy->getBitWidth() <= 32) return Attrs.hasAttribute(SignExt ? 
Attribute::SExt : Attribute::ZExt); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 110273a9f480f..746f01b51c9c0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -450,9 +450,9 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { ArgAttributes.push_back(AttributeSet()); // Copy the argument attributes from the original for (unsigned I = 0, E = CI->getNumArgOperands(); I < E; ++I) - ArgAttributes.push_back(InvokeAL.getParamAttributes(I)); + ArgAttributes.push_back(InvokeAL.getParamAttrs(I)); - AttrBuilder FnAttrs(InvokeAL.getFnAttributes()); + AttrBuilder FnAttrs(InvokeAL.getFnAttrs()); if (FnAttrs.contains(Attribute::AllocSize)) { // The allocsize attribute (if any) referes to parameters by index and needs // to be adjusted. @@ -466,9 +466,8 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) { } // Reconstruct the AttributesList based on the vector we constructed. - AttributeList NewCallAL = - AttributeList::get(C, AttributeSet::get(C, FnAttrs), - InvokeAL.getRetAttributes(), ArgAttributes); + AttributeList NewCallAL = AttributeList::get( + C, AttributeSet::get(C, FnAttrs), InvokeAL.getRetAttrs(), ArgAttributes); NewCall->setAttributes(NewCallAL); CI->replaceAllUsesWith(NewCall); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 9ccd4e98320aa..80f59945b6e57 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -907,7 +907,7 @@ void CoroCloner::create() { // Bootstrap attributes by copying function attributes from the // original function. This should include optimization settings and so on. 
NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, - OrigAttrs.getFnAttributes()); + OrigAttrs.getFnAttrs()); addFramePointerAttrs(NewAttrs, Context, 0, Shape.FrameSize, Shape.FrameAlign); @@ -929,7 +929,7 @@ void CoroCloner::create() { } // Transfer the original function's attributes. - auto FnAttrs = OrigF.getAttributes().getFnAttributes(); + auto FnAttrs = OrigF.getAttributes().getFnAttrs(); NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, FnAttrs); break; diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index f670a101767e9..5d6b750d9a460 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -148,7 +148,7 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, } else if (!ArgsToPromote.count(&*I)) { // Unchanged argument Params.push_back(I->getType()); - ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(PAL.getParamAttrs(ArgNo)); } else if (I->use_empty()) { // Dead argument (which are always marked as promotable) ++NumArgumentsDead; @@ -231,8 +231,8 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // Recompute the parameter attributes list based on the new arguments for // the function. 
- NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(), - PAL.getRetAttributes(), ArgAttrVec)); + NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttrs(), + PAL.getRetAttrs(), ArgAttrVec)); ArgAttrVec.clear(); F->getParent()->getFunctionList().insert(F->getIterator(), NF); @@ -257,7 +257,7 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, ++I, ++AI, ++ArgNo) if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { Args.push_back(*AI); // Unmodified argument - ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); } else if (ByValArgsToTransform.count(&*I)) { // Emit a GEP and load for each element of the struct. Type *AgTy = I->getParamByValType(); @@ -325,7 +325,7 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, // Push any varargs arguments on the list. for (; AI != CB.arg_end(); ++AI, ++ArgNo) { Args.push_back(*AI); - ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(ArgNo)); } SmallVector OpBundles; @@ -341,9 +341,9 @@ doPromotion(Function *F, SmallPtrSetImpl &ArgsToPromote, NewCS = NewCall; } NewCS->setCallingConv(CB.getCallingConv()); - NewCS->setAttributes( - AttributeList::get(F->getContext(), CallPAL.getFnAttributes(), - CallPAL.getRetAttributes(), ArgAttrVec)); + NewCS->setAttributes(AttributeList::get(F->getContext(), + CallPAL.getFnAttrs(), + CallPAL.getRetAttrs(), ArgAttrVec)); NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg}); Args.clear(); ArgAttrVec.clear(); diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 8fb82d60033f3..0585d2c716400 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2159,7 +2159,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } else { NewArgumentTypes.push_back(Arg.getType()); NewArgumentAttributes.push_back( - 
OldFnAttributeList.getParamAttributes(Arg.getArgNo())); + OldFnAttributeList.getParamAttrs(Arg.getArgNo())); } } @@ -2190,8 +2190,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( // the function. LLVMContext &Ctx = OldFn->getContext(); NewFn->setAttributes(AttributeList::get( - Ctx, OldFnAttributeList.getFnAttributes(), - OldFnAttributeList.getRetAttributes(), NewArgumentAttributes)); + Ctx, OldFnAttributeList.getFnAttrs(), OldFnAttributeList.getRetAttrs(), + NewArgumentAttributes)); // Since we have now created the new function, splice the body of the old // function right into the new function, leaving the old rotting hulk of the @@ -2236,7 +2236,7 @@ ChangeStatus Attributor::rewriteFunctionSignatures( } else { NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum)); NewArgOperandAttributes.push_back( - OldCallAttributeList.getParamAttributes(OldArgNum)); + OldCallAttributeList.getParamAttrs(OldArgNum)); } } @@ -2266,8 +2266,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( NewCB->setCallingConv(OldCB->getCallingConv()); NewCB->takeName(OldCB); NewCB->setAttributes(AttributeList::get( - Ctx, OldCallAttributeList.getFnAttributes(), - OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes)); + Ctx, OldCallAttributeList.getFnAttrs(), + OldCallAttributeList.getRetAttrs(), NewArgOperandAttributes)); CallSitePairs.push_back({OldCB, NewCB}); return true; diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index eeff111f702d4..f95cffc71c08b 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -188,9 +188,9 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { if (!PAL.isEmpty()) { SmallVector ArgAttrs; for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo) - ArgAttrs.push_back(PAL.getParamAttributes(ArgNo)); - PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(), - 
PAL.getRetAttributes(), ArgAttrs); + ArgAttrs.push_back(PAL.getParamAttrs(ArgNo)); + PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttrs(), + PAL.getRetAttrs(), ArgAttrs); } SmallVector OpBundles; @@ -762,7 +762,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (LiveValues.erase(Arg)) { Params.push_back(I->getType()); ArgAlive[ArgI] = true; - ArgAttrVec.push_back(PAL.getParamAttributes(ArgI)); + ArgAttrVec.push_back(PAL.getParamAttrs(ArgI)); HasLiveReturnedArg |= PAL.hasParamAttr(ArgI, Attribute::Returned); } else { ++NumArgumentsEliminated; @@ -838,7 +838,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttrBuilder RAttrs(PAL.getRetAttributes()); + AttrBuilder RAttrs(PAL.getRetAttrs()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes @@ -853,8 +853,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Strip allocsize attributes. They might refer to the deleted arguments. - AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute( - F->getContext(), Attribute::AllocSize); + AttributeSet FnAttrs = + PAL.getFnAttrs().removeAttribute(F->getContext(), Attribute::AllocSize); // Reconstruct the AttributesList based on the vector we constructed. assert(ArgAttrVec.size() == Params.size()); @@ -889,7 +889,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. 
- AttrBuilder RAttrs(CallPAL.getRetAttributes()); + AttrBuilder RAttrs(CallPAL.getRetAttrs()); RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -903,7 +903,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[Pi]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - AttributeSet Attrs = CallPAL.getParamAttributes(Pi); + AttributeSet Attrs = CallPAL.getParamAttrs(Pi); if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) { // If the return type has changed, then get rid of 'returned' on the // call site. The alternative is to make all 'returned' attributes on @@ -922,7 +922,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Push any varargs arguments on the list. Don't forget their attributes. for (auto E = CB.arg_end(); I != E; ++I, ++Pi) { Args.push_back(*I); - ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi)); + ArgAttrVec.push_back(CallPAL.getParamAttrs(Pi)); } // Reconstruct the AttributesList based on the vector we constructed. @@ -930,7 +930,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { // Again, be sure to remove any allocsize attributes, since their indices // may now be incorrect. - AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute( + AttributeSet FnAttrs = CallPAL.getFnAttrs().removeAttribute( F->getContext(), Attribute::AllocSize); AttributeList NewCallPAL = AttributeList::get( diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index adf9ffba57801..318c1494445d5 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -1231,8 +1231,7 @@ static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup, *CurrentGroup.OutlinedFunction); // Transfer the attributes from the function to the new function. 
- for (Attribute A : - CurrentOS->ExtractedFunction->getAttributes().getFnAttributes()) + for (Attribute A : CurrentOS->ExtractedFunction->getAttributes().getFnAttrs()) CurrentGroup.OutlinedFunction->addFnAttr(A); // Create an output block for the first extracted function. diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 6c225118b7394..b69d06ce2587f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1063,7 +1063,7 @@ struct OpenMPOpt { // Forward parameter attributes from the callback to the callee. for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands(); U < E; ++U) - for (const Attribute &A : CI->getAttributes().getParamAttributes(U)) + for (const Attribute &A : CI->getAttributes().getParamAttrs(U)) NewCI->addParamAttr( U - (CallbackFirstArgOperand - CallbackCalleeOperand), A); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index eea848d3eb2f3..b7734b427431d 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -181,9 +181,9 @@ void simplifyExternals(Module &M) { F.getAddressSpace(), "", &M); NewF->copyAttributesFrom(&F); // Only copy function attribtues. 
- NewF->setAttributes( - AttributeList::get(M.getContext(), AttributeList::FunctionIndex, - F.getAttributes().getFnAttributes())); + NewF->setAttributes(AttributeList::get(M.getContext(), + AttributeList::FunctionIndex, + F.getAttributes().getFnAttrs())); NewF->takeName(&F); F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType())); F.eraseFromParent(); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 7a8946110785a..1cac59b482372 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1361,10 +1361,10 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo, M.getContext(), ArrayRef{Attribute::get( M.getContext(), Attribute::Nest)})); for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I) - NewArgAttrs.push_back(Attrs.getParamAttributes(I)); + NewArgAttrs.push_back(Attrs.getParamAttrs(I)); NewCS->setAttributes( - AttributeList::get(M.getContext(), Attrs.getFnAttributes(), - Attrs.getRetAttributes(), NewArgAttrs)); + AttributeList::get(M.getContext(), Attrs.getFnAttrs(), + Attrs.getRetAttrs(), NewArgAttrs)); CB.replaceAllUsesWith(NewCS); CB.eraseFromParent(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 84b69695ac510..a532c8e468683 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2829,7 +2829,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL)) return false; // Cannot transform this parameter value. - if (AttrBuilder(CallerPAL.getParamAttributes(i)) + if (AttrBuilder(CallerPAL.getParamAttrs(i)) .overlaps(AttributeFuncs::typeIncompatible(ParamTy))) return false; // Attribute not compatible with transformed value. 
@@ -2912,11 +2912,11 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Add any parameter attributes. if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) { - AttrBuilder AB(CallerPAL.getParamAttributes(i)); + AttrBuilder AB(CallerPAL.getParamAttrs(i)); AB.addByValAttr(NewArg->getType()->getPointerElementType()); ArgAttrs.push_back(AttributeSet::get(Ctx, AB)); } else - ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); + ArgAttrs.push_back(CallerPAL.getParamAttrs(i)); } // If the function takes more arguments than the call was taking, add them @@ -2943,12 +2943,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { Args.push_back(NewArg); // Add any parameter attributes. - ArgAttrs.push_back(CallerPAL.getParamAttributes(i)); + ArgAttrs.push_back(CallerPAL.getParamAttrs(i)); } } } - AttributeSet FnAttrs = CallerPAL.getFnAttributes(); + AttributeSet FnAttrs = CallerPAL.getFnAttrs(); if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. @@ -3049,7 +3049,7 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, for (FunctionType::param_iterator I = NestFTy->param_begin(), E = NestFTy->param_end(); I != E; ++NestArgNo, ++I) { - AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo); + AttributeSet AS = NestAttrs.getParamAttrs(NestArgNo); if (AS.hasAttribute(Attribute::Nest)) { // Record the parameter type and any other attributes. NestTy = *I; @@ -3085,7 +3085,7 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, // Add the original argument and attributes. 
NewArgs.push_back(*I); - NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); + NewArgAttrs.push_back(Attrs.getParamAttrs(ArgNo)); ++ArgNo; ++I; @@ -3131,8 +3131,8 @@ InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call, NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); AttributeList NewPAL = - AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(), - Attrs.getRetAttributes(), NewArgAttrs); + AttributeList::get(FTy->getContext(), Attrs.getFnAttrs(), + Attrs.getRetAttrs(), NewArgAttrs); SmallVector OpBundles; Call.getOperandBundlesAsDefs(OpBundles); diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 63aa84e4a77cd..caa5b9908e456 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -349,18 +349,18 @@ transformFunctionAttributes(const TransformedFunction &TransformedFunction, for (unsigned I = 0, IE = TransformedFunction.ArgumentIndexMapping.size(); I < IE; ++I) { unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[I]; - ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(I); + ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttrs(I); } // Copy annotations on varargs arguments. 
for (unsigned I = TransformedFunction.OriginalType->getNumParams(), IE = CallSiteAttrs.getNumAttrSets(); I < IE; ++I) { - ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(I)); + ArgumentAttributes.push_back(CallSiteAttrs.getParamAttrs(I)); } - return AttributeList::get(Ctx, CallSiteAttrs.getFnAttributes(), - CallSiteAttrs.getRetAttributes(), + return AttributeList::get(Ctx, CallSiteAttrs.getFnAttrs(), + CallSiteAttrs.getRetAttrs(), llvm::makeArrayRef(ArgumentAttributes)); } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 4e755bab15f3a..385a9ec7457da 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4176,7 +4176,7 @@ struct VarArgAMD64Helper : public VarArgHelper { MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) { AMD64FpEndOffset = AMD64FpEndOffsetSSE; - for (const auto &Attr : F.getAttributes().getFnAttributes()) { + for (const auto &Attr : F.getAttributes().getFnAttrs()) { if (Attr.isStringAttribute() && (Attr.getKindAsString() == "target-features")) { if (Attr.getValueAsString().contains("-sse")) diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index bc0fecc972fc1..4be07558d42df 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1377,11 +1377,11 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx, return AL; // Remove the readonly, readnone, and statepoint function attributes. 
- AttrBuilder FnAttrs = AL.getFnAttributes(); + AttrBuilder FnAttrs = AL.getFnAttrs(); for (auto Attr : FnAttrsToStrip) FnAttrs.removeAttribute(Attr); - for (Attribute A : AL.getFnAttributes()) { + for (Attribute A : AL.getFnAttrs()) { if (isStatepointDirectiveAttr(A)) FnAttrs.remove(A); } @@ -1801,7 +1801,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */ CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name); GCResult->setAttributes( AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex, - Call->getAttributes().getRetAttributes())); + Call->getAttributes().getRetAttrs())); // We cannot RAUW or delete CS.getInstruction() because it could be in the // live set of some other safepoint, in which case that safepoint's diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index d689e04da36f9..dd96c763ce7ce 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -212,7 +212,7 @@ struct AssumeBuilderState { if (!IsPoisonAttr || Call->isPassingUndefUB(Idx - 1)) addAttribute(Attr, Call->getArgOperand(Idx - 1)); } - for (Attribute Attr : AttrList.getFnAttributes()) + for (Attribute Attr : AttrList.getFnAttrs()) addAttribute(Attr, nullptr); }; addAttrList(Call->getAttributes()); diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 87868251036c1..33cb8823086e0 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -485,7 +485,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, CB.setArgOperand(ArgNo, Cast); // Remove any incompatible attributes for the argument. 
- AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo)); + AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo)); ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); // If byval is used, this must be a pointer type, and the byval type must @@ -496,7 +496,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs)); AttributeChanged = true; } else - NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo)); + NewArgAttrs.push_back(CallerPAL.getParamAttrs(ArgNo)); } // If the return type of the call site doesn't match that of the callee, cast @@ -511,7 +511,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, // Set the new callsite attribute. if (AttributeChanged) - CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(), + CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttrs(), AttributeSet::get(Ctx, RAttrs), NewArgAttrs)); diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index 0ac9a5aaa425b..7ea799a3f6453 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -116,13 +116,13 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, for (const Argument &OldArg : OldFunc->args()) { if (Argument *NewArg = dyn_cast(VMap[&OldArg])) { NewArgAttrs[NewArg->getArgNo()] = - OldAttrs.getParamAttributes(OldArg.getArgNo()); + OldAttrs.getParamAttrs(OldArg.getArgNo()); } } NewFunc->setAttributes( - AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), - OldAttrs.getRetAttributes(), NewArgAttrs)); + AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttrs(), + OldAttrs.getRetAttrs(), NewArgAttrs)); // Everything else beyond this point deals with function instructions, // so if we are dealing with a function declaration, we're done. 
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 9edc52b535508..ccd62b7bbd41e 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -885,7 +885,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, // "target-features" attribute allowing it to be lowered. // FIXME: This should be changed to check to see if a specific // attribute can not be inherited. - for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) { + for (const auto &Attr : oldFunction->getAttributes().getFnAttrs()) { if (Attr.isStringAttribute()) { if (Attr.getKindAsString() == "thunk") continue; diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 792aa8208f276..34236f2697b7f 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2102,7 +2102,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, for (unsigned i = CalledFunc->getFunctionType()->getNumParams(); i < CB.getNumArgOperands(); i++) { VarArgsToForward.push_back(CB.getArgOperand(i)); - VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i)); + VarArgsAttrs.push_back(CB.getAttributes().getParamAttrs(i)); } bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false; @@ -2135,13 +2135,13 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) { for (unsigned ArgNo = 0; ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo) - ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo)); + ArgAttrs.push_back(Attrs.getParamAttrs(ArgNo)); } // Add VarArg attributes. 
ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end()); - Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(), - Attrs.getRetAttributes(), ArgAttrs); + Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttrs(), + Attrs.getRetAttrs(), ArgAttrs); // Add VarArgs to existing parameters. SmallVector Params(CI->arg_operands()); Params.append(VarArgsToForward.begin(), VarArgsToForward.end()); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 608b77c582242..ddcb55984861f 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -577,9 +577,9 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { if (SrcLen == 0) { // strncpy(x, "", y) -> memset(x, '\0', y) Align MemSetAlign = - CI->getAttributes().getParamAttributes(0).getAlignment().valueOrOne(); + CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne(); CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, MemSetAlign); - AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0)); + AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); return Dst; diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp index 2601ee318f7db..d068656170a7f 100644 --- a/llvm/tools/bugpoint/CrashDebugger.cpp +++ b/llvm/tools/bugpoint/CrashDebugger.cpp @@ -375,7 +375,7 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs( // Pass along the set of attributes that caused the crash. 
Attrs.clear(); - for (Attribute A : NewAttrs.getFnAttributes()) { + for (Attribute A : NewAttrs.getFnAttrs()) { Attrs.push_back(A); } return true; @@ -1232,7 +1232,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) { assert(Fn && "Could not find function?"); std::vector Attrs; - for (Attribute A : Fn->getAttributes().getFnAttributes()) + for (Attribute A : Fn->getAttributes().getFnAttrs()) Attrs.push_back(A); OldSize += Attrs.size(); diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index a11b75fc881c9..6e24c150cae8f 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -289,13 +289,13 @@ TEST_F(IRBuilderTest, ConstrainedFP) { EXPECT_EQ(II->getIntrinsicID(), Intrinsic::experimental_constrained_fpext); // Verify attributes on the call are created automatically. - AttributeSet CallAttrs = II->getAttributes().getFnAttributes(); + AttributeSet CallAttrs = II->getAttributes().getFnAttrs(); EXPECT_EQ(CallAttrs.hasAttribute(Attribute::StrictFP), true); // Verify attributes on the containing function are created when requested. Builder.setConstrainedFPFunctionAttr(); AttributeList Attrs = BB->getParent()->getAttributes(); - AttributeSet FnAttrs = Attrs.getFnAttributes(); + AttributeSet FnAttrs = Attrs.getFnAttrs(); EXPECT_EQ(FnAttrs.hasAttribute(Attribute::StrictFP), true); // Verify the codepaths for setting and overriding the default metadata. @@ -392,8 +392,8 @@ TEST_F(IRBuilderTest, ConstrainedFPFunctionCall) { CallInst *FCall = Builder.CreateCall(Callee, None); // Check the attributes to verify the strictfp attribute is on the call. 
- EXPECT_TRUE(FCall->getAttributes().getFnAttributes().hasAttribute( - Attribute::StrictFP)); + EXPECT_TRUE( + FCall->getAttributes().getFnAttrs().hasAttribute(Attribute::StrictFP)); Builder.CreateRetVoid(); EXPECT_FALSE(verifyModule(*M)); From 10bbf417377437a66e2c1a575c4607c8de4c6dc0 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 11:29:20 -0700 Subject: [PATCH 003/700] Add missed rename of getFnAttributes() -> getFnAttrs() --- llvm/tools/bugpoint-passes/TestPasses.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/bugpoint-passes/TestPasses.cpp b/llvm/tools/bugpoint-passes/TestPasses.cpp index 6667cbe9255ec..1e669538ec33b 100644 --- a/llvm/tools/bugpoint-passes/TestPasses.cpp +++ b/llvm/tools/bugpoint-passes/TestPasses.cpp @@ -143,7 +143,7 @@ class CrashOnFunctionAttribute : public FunctionPass { } bool runOnFunction(Function &F) override { - AttributeSet A = F.getAttributes().getFnAttributes(); + AttributeSet A = F.getAttributes().getFnAttrs(); if (A.hasAttribute("bugpoint-crash")) abort(); return false; From 7b20e05c714e273ebe89d713cec61e5a022bbac6 Mon Sep 17 00:00:00 2001 From: zoecarver Date: Tue, 3 Aug 2021 13:05:20 -0700 Subject: [PATCH 004/700] [libcxx][ranges] Add `ranges::iota_view`. 
Differential Revision: https://reviews.llvm.org/D107396 --- libcxx/docs/Status/RangesPaper.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__iterator/concepts.h | 2 + libcxx/include/__ranges/iota_view.h | 403 ++++++++++++++++++ libcxx/include/module.modulemap | 1 + libcxx/include/ranges | 8 + .../ranges/iota_view.module.verify.cpp | 16 + .../weakly_incrementable.compile.pass.cpp | 1 + .../range.iota.view/begin.pass.cpp | 61 +++ .../borrowing.compile.pass.cpp | 25 ++ .../range.iota.view/ctad.compile.pass.cpp | 54 +++ .../range.iota.view/ctor.default.pass.cpp | 38 ++ .../range.iota.view/ctor.first.last.pass.cpp | 49 +++ .../range.iota.view/ctor.value.bound.pass.cpp | 60 +++ .../range.iota.view/ctor.value.pass.cpp | 72 ++++ .../range.iota.view/end.pass.cpp | 82 ++++ .../range.iota.view/iterator/compare.pass.cpp | 86 ++++ .../iterator/ctor.default.pass.cpp | 34 ++ .../iterator/ctor.value.pass.cpp | 46 ++ .../iterator/decrement.pass.cpp | 67 +++ .../iterator/increment.pass.cpp | 70 +++ .../iterator/member_typedefs.compile.pass.cpp | 163 +++++++ .../range.iota.view/iterator/minus.pass.cpp | 179 ++++++++ .../iterator/minus_eq.pass.cpp | 91 ++++ .../range.iota.view/iterator/plus.pass.cpp | 88 ++++ .../range.iota.view/iterator/plus_eq.pass.cpp | 91 ++++ .../range.iota.view/iterator/star.pass.cpp | 106 +++++ .../iterator/subscript.pass.cpp | 67 +++ ...range_concept_conformance.compile.pass.cpp | 44 ++ .../sentinel/ctor.default.pass.cpp | 34 ++ .../sentinel/ctor.value.pass.cpp | 48 +++ .../range.iota.view/sentinel/eq.pass.cpp | 59 +++ .../range.iota.view/sentinel/minus.pass.cpp | 65 +++ .../range.iota.view/size.pass.cpp | 101 +++++ .../range.iota.view/type.compile.pass.cpp | 22 + .../range.factories/range.iota.view/types.h | 212 +++++++++ .../range.iota.view/views_iota.pass.cpp | 83 ++++ 37 files changed, 2630 insertions(+), 1 deletion(-) create mode 100644 libcxx/include/__ranges/iota_view.h create mode 100644 
libcxx/test/libcxx/diagnostics/detail.headers/ranges/iota_view.module.verify.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/begin.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/borrowing.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/ctad.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/ctor.default.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/end.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/compare.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.default.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.value.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/decrement.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/increment.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/member_typedefs.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus_eq.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus_eq.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/iterator/star.pass.cpp create mode 100644 
libcxx/test/std/ranges/range.factories/range.iota.view/iterator/subscript.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/range_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.default.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.value.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/eq.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/minus.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/type.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/types.h create mode 100644 libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp diff --git a/libcxx/docs/Status/RangesPaper.csv b/libcxx/docs/Status/RangesPaper.csv index e174cda1a7ca2..77239c314f787 100644 --- a/libcxx/docs/Status/RangesPaper.csv +++ b/libcxx/docs/Status/RangesPaper.csv @@ -138,7 +138,7 @@ Section,Description,Dependencies,Assignee,Complete `[range.view.ref] `_,`ref-view `_,[view.interface],Zoe Carver,✅ `[range.filter] `_,filter_view,[range.all],Louis Dionne,Not started `[range.transform] `_,`transform_view `_,[range.all],Zoe Carver,✅ -`[range.iota] `_,iota_view,[range.all],Zoe Carver,In Progress +`[range.iota] `_,iota_view,[range.all],Zoe Carver,✅ `[range.take] `_,take_view,[range.all],Zoe Carver,✅ `[range.join] `_,join_view,[range.all],Zoe Carver,In Progress `[range.empty] `_,`empty_view `_,[view.interface],Zoe Carver,✅ diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index d2760ee45ad3c..914c74c7ceb60 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -223,6 +223,7 @@ set(files __ranges/empty.h 
__ranges/enable_borrowed_range.h __ranges/enable_view.h + __ranges/iota_view.h __ranges/non_propagating_cache.h __ranges/ref_view.h __ranges/reverse_view.h diff --git a/libcxx/include/__iterator/concepts.h b/libcxx/include/__iterator/concepts.h index 6eb4aef10528a..db836bda25391 100644 --- a/libcxx/include/__iterator/concepts.h +++ b/libcxx/include/__iterator/concepts.h @@ -72,6 +72,8 @@ concept __signed_integer_like = signed_integral<_Tp>; template concept weakly_incrementable = + // TODO: remove this once the clang bug is fixed (bugs.llvm.org/PR48173). + !same_as<_Ip, bool> && // Currently, clang does not handle bool correctly. movable<_Ip> && requires(_Ip __i) { typename iter_difference_t<_Ip>; diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h new file mode 100644 index 0000000000000..f302826b02474 --- /dev/null +++ b/libcxx/include/__ranges/iota_view.h @@ -0,0 +1,403 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef _LIBCPP___RANGES_IOTA_VIEW_H +#define _LIBCPP___RANGES_IOTA_VIEW_H + +#include <__config> +#include <__debug> +#include <__functional/ranges_operations.h> +#include <__iterator/concepts.h> +#include <__iterator/incrementable_traits.h> +#include <__iterator/iterator_traits.h> +#include <__iterator/unreachable_sentinel.h> +#include <__ranges/copyable_box.h> +#include <__ranges/enable_borrowed_range.h> +#include <__ranges/view_interface.h> +#include <__utility/forward.h> +#include <__utility/move.h> +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +#pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if !defined(_LIBCPP_HAS_NO_RANGES) + +namespace ranges { + template + struct __get_wider_signed { + static auto __call() { + if constexpr (sizeof(_Int) < sizeof(short)) return type_identity{}; + else if constexpr (sizeof(_Int) < sizeof(int)) return type_identity{}; + else if constexpr (sizeof(_Int) < sizeof(long)) return type_identity{}; + else return type_identity{}; + + static_assert(sizeof(_Int) <= sizeof(long long), + "Found integer-like type that is bigger than largest integer like type."); + } + + using type = typename decltype(__call())::type; + }; + + template + using _IotaDiffT = typename _If< + (!integral<_Start> || sizeof(iter_difference_t<_Start>) > sizeof(_Start)), + type_identity>, + __get_wider_signed<_Start> + >::type; + + template + concept __decrementable = incrementable<_Iter> && requires(_Iter __i) { + { --__i } -> same_as<_Iter&>; + { __i-- } -> same_as<_Iter>; + }; + + template + concept __advanceable = + __decrementable<_Iter> && totally_ordered<_Iter> && + requires(_Iter __i, const _Iter __j, const _IotaDiffT<_Iter> __n) { + { __i += __n } -> same_as<_Iter&>; + { __i -= __n } -> same_as<_Iter&>; + _Iter(__j + __n); + _Iter(__n + __j); + _Iter(__j - __n); + 
{ __j - __j } -> convertible_to<_IotaDiffT<_Iter>>; + }; + + template + struct __iota_iterator_category {}; + + template + struct __iota_iterator_category<_Tp> { + using iterator_category = input_iterator_tag; + }; + + template + requires __weakly_equality_comparable_with<_Start, _Bound> && copyable<_Start> + class iota_view : public view_interface> { + struct __iterator : public __iota_iterator_category<_Start> { + friend class iota_view; + + using iterator_concept = + _If<__advanceable<_Start>, random_access_iterator_tag, + _If<__decrementable<_Start>, bidirectional_iterator_tag, + _If, forward_iterator_tag, + /*Else*/ input_iterator_tag>>>; + + using value_type = _Start; + using difference_type = _IotaDiffT<_Start>; + + _Start __value_ = _Start(); + + _LIBCPP_HIDE_FROM_ABI + __iterator() requires default_initializable<_Start> = default; + + _LIBCPP_HIDE_FROM_ABI + constexpr explicit __iterator(_Start __value) : __value_(_VSTD::move(__value)) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr _Start operator*() const noexcept(is_nothrow_copy_constructible_v<_Start>) { + return __value_; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator& operator++() { + ++__value_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr void operator++(int) { ++*this; } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator operator++(int) requires incrementable<_Start> { + auto __tmp = *this; + ++*this; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator& operator--() requires __decrementable<_Start> { + --__value_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator operator--(int) requires __decrementable<_Start> { + auto __tmp = *this; + --*this; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator& operator+=(difference_type __n) + requires __advanceable<_Start> + { + if constexpr (__integer_like<_Start> && !__signed_integer_like<_Start>) { + if (__n >= difference_type(0)) { + __value_ += static_cast<_Start>(__n); + } else { + __value_ 
-= static_cast<_Start>(-__n); + } + } else { + __value_ += __n; + } + return *this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator& operator-=(difference_type __n) + requires __advanceable<_Start> + { + if constexpr (__integer_like<_Start> && !__signed_integer_like<_Start>) { + if (__n >= difference_type(0)) { + __value_ -= static_cast<_Start>(__n); + } else { + __value_ += static_cast<_Start>(-__n); + } + } else { + __value_ -= __n; + } + return *this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr _Start operator[](difference_type __n) const + requires __advanceable<_Start> + { + return _Start(__value_ + __n); + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator==(const __iterator& __x, const __iterator& __y) + requires equality_comparable<_Start> + { + return __x.__value_ == __y.__value_; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator<(const __iterator& __x, const __iterator& __y) + requires totally_ordered<_Start> + { + return __x.__value_ < __y.__value_; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator>(const __iterator& __x, const __iterator& __y) + requires totally_ordered<_Start> + { + return __y < __x; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator<=(const __iterator& __x, const __iterator& __y) + requires totally_ordered<_Start> + { + return !(__y < __x); + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator>=(const __iterator& __x, const __iterator& __y) + requires totally_ordered<_Start> + { + return !(__x < __y); + } + +// friend constexpr auto operator<=>(const __iterator& __x, const __iterator& __y) +// requires totally_ordered<_Start> && three_way_comparable<_Start> +// { +// return __x.__value_ <=> __y.__value_; +// } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr __iterator operator+(__iterator __i, difference_type __n) + requires __advanceable<_Start> + { + __i += __n; + return __i; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr __iterator operator+(difference_type __n, __iterator 
__i) + requires __advanceable<_Start> + { + return __i + __n; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr __iterator operator-(__iterator __i, difference_type __n) + requires __advanceable<_Start> + { + __i -= __n; + return __i; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr difference_type operator-(const __iterator& __x, const __iterator& __y) + requires __advanceable<_Start> + { + if constexpr (__integer_like<_Start>) { + if constexpr (__signed_integer_like<_Start>) { + return difference_type(difference_type(__x.__value_) - difference_type(__y.__value_)); + } + if (__y.__value_ > __x.__value_) { + return difference_type(-difference_type(__y.__value_ - __x.__value_)); + } + return difference_type(__x.__value_ - __y.__value_); + } + return __x.__value_ - __y.__value_; + } + }; + + struct __sentinel { + friend class iota_view; + + private: + _Bound __bound_ = _Bound(); + + public: + _LIBCPP_HIDE_FROM_ABI + __sentinel() = default; + constexpr explicit __sentinel(_Bound __bound) : __bound_(_VSTD::move(__bound)) {} + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator==(const __iterator& __x, const __sentinel& __y) { + return __x.__value_ == __y.__bound_; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr iter_difference_t<_Start> operator-(const __iterator& __x, const __sentinel& __y) + requires sized_sentinel_for<_Bound, _Start> + { + return __x.__value_ - __y.__bound_; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr iter_difference_t<_Start> operator-(const __sentinel& __x, const __iterator& __y) + requires sized_sentinel_for<_Bound, _Start> + { + return -(__y - __x); + } + }; + + _Start __value_ = _Start(); + _Bound __bound_ = _Bound(); + + public: + _LIBCPP_HIDE_FROM_ABI + iota_view() requires default_initializable<_Start> = default; + + _LIBCPP_HIDE_FROM_ABI + constexpr explicit iota_view(_Start __value) : __value_(_VSTD::move(__value)) { } + + _LIBCPP_HIDE_FROM_ABI + constexpr iota_view(type_identity_t<_Start> __value, type_identity_t<_Bound> 
__bound) + : __value_(_VSTD::move(__value)), __bound_(_VSTD::move(__bound)) { + // Validate the precondition if possible. + if constexpr (totally_ordered_with<_Start, _Bound>) { + _LIBCPP_ASSERT(ranges::less_equal()(__value_, __bound_), + "Precondition violated: value is greater than bound."); + } + } + + _LIBCPP_HIDE_FROM_ABI + constexpr iota_view(__iterator __first, __iterator __last) + requires same_as<_Start, _Bound> + : iota_view(_VSTD::move(__first.__value_), _VSTD::move(__last.__value_)) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr iota_view(__iterator __first, _Bound __last) + requires same_as<_Bound, unreachable_sentinel_t> + : iota_view(_VSTD::move(__first.__value_), _VSTD::move(__last)) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr iota_view(__iterator __first, __sentinel __last) + requires (!same_as<_Start, _Bound> && !same_as<_Start, unreachable_sentinel_t>) + : iota_view(_VSTD::move(__first.__value_), _VSTD::move(__last.__bound_)) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator begin() const { return __iterator{__value_}; } + + _LIBCPP_HIDE_FROM_ABI + constexpr auto end() const { + if constexpr (same_as<_Bound, unreachable_sentinel_t>) + return unreachable_sentinel; + else + return __sentinel{__bound_}; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator end() const requires same_as<_Start, _Bound> { + return __iterator{__bound_}; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr auto size() const + requires (same_as<_Start, _Bound> && __advanceable<_Start>) || + (integral<_Start> && integral<_Bound>) || + sized_sentinel_for<_Bound, _Start> + { + if constexpr (__integer_like<_Start> && __integer_like<_Bound>) { + if (__value_ < 0) { + if (__bound_ < 0) { + return _VSTD::__to_unsigned_like(-__value_) - _VSTD::__to_unsigned_like(-__bound_); + } + return _VSTD::__to_unsigned_like(__bound_) + _VSTD::__to_unsigned_like(-__value_); + } + return _VSTD::__to_unsigned_like(__bound_) - _VSTD::__to_unsigned_like(__value_); + } + return _VSTD::__to_unsigned_like(__bound_ - 
__value_); + } + }; + + template + requires (!__integer_like<_Start> || !__integer_like<_Bound> || + (__signed_integer_like<_Start> == __signed_integer_like<_Bound>)) + iota_view(_Start, _Bound) -> iota_view<_Start, _Bound>; + + template + inline constexpr bool enable_borrowed_range> = true; +} // namespace ranges + +namespace views { +namespace __iota { + struct __fn { + template + _LIBCPP_HIDE_FROM_ABI + constexpr auto operator()(_Start&& __start) const + noexcept(noexcept(ranges::iota_view(_VSTD::forward<_Start>(__start)))) + -> decltype(ranges::iota_view(_VSTD::forward<_Start>(__start))) + { + return ranges::iota_view(_VSTD::forward<_Start>(__start)); + } + + template + _LIBCPP_HIDE_FROM_ABI + constexpr auto operator()(_Start&& __start, _Bound&& __bound) const + noexcept(noexcept(ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound)))) + -> decltype(ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound))) + { + return ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound)); + } + }; +} // namespace __iota + +inline namespace __cpo { + inline constexpr auto iota = __iota::__fn{}; +} +} // namespace views + +#endif // !defined(_LIBCPP_HAS_NO_RANGES) + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___RANGES_IOTA_VIEW_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index f7c899c5fe535..079dff201804b 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -658,6 +658,7 @@ module std [system] { module empty_view { private header "__ranges/empty_view.h" } module enable_borrowed_range { private header "__ranges/enable_borrowed_range.h" } module enable_view { private header "__ranges/enable_view.h" } + module iota_view { private header "__ranges/iota_view.h" } module non_propagating_cache { private header "__ranges/non_propagating_cache.h" } module ref_view { private header "__ranges/ref_view.h" } module reverse_view { 
private header "__ranges/reverse_view.h" } diff --git a/libcxx/include/ranges b/libcxx/include/ranges index 49e79647739c8..df8d4194ffa14 100644 --- a/libcxx/include/ranges +++ b/libcxx/include/ranges @@ -177,6 +177,13 @@ namespace std::ranges { template requires is_object_v class single_view; + + template + requires weakly-equality-comparable-with && copyable + class iota_view; + + template + inline constexpr bool enable_borrowed_range> = true; } */ @@ -199,6 +206,7 @@ namespace std::ranges { #include <__ranges/empty_view.h> #include <__ranges/enable_borrowed_range.h> #include <__ranges/enable_view.h> +#include <__ranges/iota_view.h> #include <__ranges/ref_view.h> #include <__ranges/reverse_view.h> #include <__ranges/take_view.h> diff --git a/libcxx/test/libcxx/diagnostics/detail.headers/ranges/iota_view.module.verify.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/ranges/iota_view.module.verify.cpp new file mode 100644 index 0000000000000..8b2d4cad2da25 --- /dev/null +++ b/libcxx/test/libcxx/diagnostics/detail.headers/ranges/iota_view.module.verify.cpp @@ -0,0 +1,16 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: modules-build + +// WARNING: This test was generated by 'generate_private_header_tests.py' +// and should not be edited manually. 
+ +// expected-error@*:* {{use of private header from outside its module: '__ranges/iota_view.h'}} +#include <__ranges/iota_view.h> diff --git a/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.winc/weakly_incrementable.compile.pass.cpp b/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.winc/weakly_incrementable.compile.pass.cpp index a3198357bbb76..5fad38d4d6cfd 100644 --- a/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.winc/weakly_incrementable.compile.pass.cpp +++ b/libcxx/test/std/iterators/iterator.requirements/iterator.concepts/iterator.concept.winc/weakly_incrementable.compile.pass.cpp @@ -30,6 +30,7 @@ static_assert(!std::weakly_incrementable); static_assert(!std::weakly_incrementable); static_assert(!std::weakly_incrementable); static_assert(!std::weakly_incrementable); +static_assert(!std::weakly_incrementable); struct S {}; static_assert(!std::weakly_incrementable); diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/begin.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/begin.pass.cpp new file mode 100644 index 0000000000000..16b80b19ef3b3 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/begin.pass.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator begin() const; + +#include +#include + +#include "test_macros.h" +#include "types.h" + +template +constexpr void testType() { + { + std::ranges::iota_view io(T(0)); + assert(*io.begin() == T(0)); + } + { + std::ranges::iota_view io(T(10)); + assert(*io.begin() == T(10)); + assert(*std::move(io).begin() == T(10)); + } + { + const std::ranges::iota_view io(T(0)); + assert(*io.begin() == T(0)); + } + { + const std::ranges::iota_view io(T(10)); + assert(*io.begin() == T(10)); + } +} + +constexpr bool test() { + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/borrowing.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/borrowing.compile.pass.cpp new file mode 100644 index 0000000000000..7fbf987463652 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/borrowing.compile.pass.cpp @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// inline constexpr bool enable_borrowed_range> = true; + +#include +#include +#include + +#include "test_macros.h" +#include "types.h" + +static_assert(std::ranges::enable_borrowed_range>); +static_assert(std::ranges::enable_borrowed_range>); +static_assert(std::ranges::enable_borrowed_range>>); diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctad.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctad.compile.pass.cpp new file mode 100644 index 0000000000000..d1f4e5a5f4503 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctad.compile.pass.cpp @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// requires (!is-integer-like || !is-integer-like || +// (is-signed-integer-like == is-signed-integer-like)) +// iota_view(W, Bound) -> iota_view; + +#include +#include +#include + +#include "test_macros.h" +#include "types.h" + +template +concept CanDeduce = requires(const T& t, const U& u) { + std::ranges::iota_view(t, u); +}; + +void test() { + static_assert(std::same_as< + decltype(std::ranges::iota_view(0, 0)), + std::ranges::iota_view + >); + + static_assert(std::same_as< + decltype(std::ranges::iota_view(0)), + std::ranges::iota_view + >); + + static_assert(std::same_as< + decltype(std::ranges::iota_view(0, std::unreachable_sentinel)), + std::ranges::iota_view + >); + + static_assert(std::same_as< + decltype(std::ranges::iota_view(0, IntComparableWith(0))), + std::ranges::iota_view> + >); + + static_assert( CanDeduce); + static_assert(!CanDeduce); + static_assert(!CanDeduce); +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.default.pass.cpp new file mode 100644 index 0000000000000..724656429f7d6 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.default.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// iota_view() requires default_­initializable = default; + +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool test() { + { + std::ranges::iota_view> io; + assert((*io.begin()).value_ == 42); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + static_assert(!std::default_initializable>); + static_assert( std::default_initializable>); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp new file mode 100644 index 0000000000000..61eb8a2d16732 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.first.last.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iota_view(iterator first, see below last); + +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool test() { + { + std::ranges::iota_view commonView(SomeInt(0), SomeInt(10)); + std::ranges::iota_view io(commonView.begin(), commonView.end()); + assert(std::ranges::next(io.begin(), 10) == io.end()); + } + + { + std::ranges::iota_view unreachableSent(SomeInt(0)); + std::ranges::iota_view io(unreachableSent.begin(), std::unreachable_sentinel); + assert(std::ranges::next(io.begin(), 10) != io.end()); + } + + { + std::ranges::iota_view differentTypes(SomeInt(0), IntComparableWith(SomeInt(10))); + std::ranges::iota_view> io(differentTypes.begin(), differentTypes.end()); + assert(std::ranges::next(io.begin(), 10) == io.end()); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} + diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp new file mode 100644 index 0000000000000..21f5558f61d81 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.bound.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// ADDITIONAL_COMPILE_FLAGS: -Wno-sign-compare + +// constexpr iota_view(type_identity_t value, type_identity_t bound); + +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool test() { + { + std::ranges::iota_view io(SomeInt(0), SomeInt(10)); + assert(std::ranges::next(io.begin(), 10) == io.end()); + } + + { + std::ranges::iota_view io(SomeInt(0), std::unreachable_sentinel); + assert(std::ranges::next(io.begin(), 10) != io.end()); + } + + { + std::ranges::iota_view> io(SomeInt(0), IntComparableWith(SomeInt(10))); + assert(std::ranges::next(io.begin(), 10) == io.end()); + } + + { + // This is allowed only when using the constructor (not the deduction guide). + std::ranges::iota_view signedUnsigned(0, 10); + assert(std::ranges::next(signedUnsigned.begin(), 10) == signedUnsigned.end()); + } + + { + // This is allowed only when using the constructor (not the deduction guide). + std::ranges::iota_view signedUnsigned(0, 10); + assert(std::ranges::next(signedUnsigned.begin(), 10) == signedUnsigned.end()); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} + diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.pass.cpp new file mode 100644 index 0000000000000..cc65b9368a2f9 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/ctor.value.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr explicit iota_view(W value); + +#include +#include + +#include "test_macros.h" +#include "types.h" + +struct SomeIntComparable { + using difference_type = int; + + SomeInt value_; + constexpr SomeIntComparable() : value_(SomeInt(10)) {} + + friend constexpr bool operator==(SomeIntComparable lhs, SomeIntComparable rhs) { + return lhs.value_ == rhs.value_; + } + friend constexpr bool operator==(SomeIntComparable lhs, SomeInt rhs) { + return lhs.value_ == rhs; + } + friend constexpr bool operator==(SomeInt lhs, SomeIntComparable rhs) { + return lhs == rhs.value_; + } + + friend constexpr difference_type operator-(SomeIntComparable lhs, SomeIntComparable rhs) { + return lhs.value_ - rhs.value_; + } + + constexpr SomeIntComparable& operator++() { ++value_; return *this; } + constexpr SomeIntComparable operator++(int) { auto tmp = *this; ++value_; return tmp; } + constexpr SomeIntComparable operator--() { --value_; return *this; } +}; + +constexpr bool test() { + { + std::ranges::iota_view io(SomeInt(42)); + assert((*io.begin()).value_ == 42); + // Check that end returns std::unreachable_sentinel. 
+ assert(io.end() != io.begin()); + static_assert(std::same_as); + } + + { + std::ranges::iota_view io(SomeInt(0)); + assert(std::ranges::next(io.begin(), 10) == io.end()); + } + { + static_assert(!std::is_convertible_v, SomeInt>); + static_assert( std::is_constructible_v, SomeInt>); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/end.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/end.pass.cpp new file mode 100644 index 0000000000000..418a52a3b9d2f --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/end.pass.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// ADDITIONAL_COMPILE_FLAGS: -Wno-sign-compare + +// constexpr auto end() const; +// constexpr iterator end() const requires same_as; + +#include +#include + +#include "test_macros.h" +#include "types.h" + +template +constexpr void testType(U u) { + { + std::ranges::iota_view io(T(0), u); + assert(std::ranges::next(io.begin(), 10) == io.end()); + } + { + std::ranges::iota_view io(T(10), u); + assert(io.begin() == io.end()); + assert(io.begin() == std::move(io).end()); + } + { + const std::ranges::iota_view io(T(0), u); + assert(std::ranges::next(io.begin(), 10) == io.end()); + assert(std::ranges::next(io.begin(), 10) == std::move(io).end()); + } + { + const std::ranges::iota_view io(T(10), u); + assert(io.begin() == io.end()); + } + + { + std::ranges::iota_view io(T(0), 
std::unreachable_sentinel); + assert(io.begin() != io.end()); + assert(std::ranges::next(io.begin()) != io.end()); + assert(std::ranges::next(io.begin(), 10) != io.end()); + } + { + const std::ranges::iota_view io(T(0), std::unreachable_sentinel); + assert(io.begin() != io.end()); + assert(std::ranges::next(io.begin()) != io.end()); + assert(std::ranges::next(io.begin(), 10) != io.end()); + } +} + +constexpr bool test() { + testType(SomeInt(10)); + testType(IntComparableWith(SomeInt(10))); + testType(IntComparableWith(10)); + testType(IntComparableWith(10)); + testType(IntComparableWith(10)); + testType(int(10)); + testType(unsigned(10)); + testType(unsigned(10)); + testType(int(10)); + testType(IntComparableWith(10)); + testType(short(10)); + testType(IntComparableWith(10)); + testType(IntComparableWith(10)); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/compare.pass.cpp new file mode 100644 index 0000000000000..6ef4751c64408 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/compare.pass.cpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr bool operator<(const iterator& x, const iterator& y) +// requires totally_ordered; +// friend constexpr bool operator>(const iterator& x, const iterator& y) +// requires totally_ordered; +// friend constexpr bool operator<=(const iterator& x, const iterator& y) +// requires totally_ordered; +// friend constexpr bool operator>=(const iterator& x, const iterator& y) +// requires totally_ordered; +// friend constexpr bool operator==(const iterator& x, const iterator& y) +// requires equality_comparable; + +// TODO: test spaceship operator once it's implemented. + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + { + const std::ranges::iota_view io(0); + assert( io.begin() == io.begin() ); + assert( io.begin() != std::ranges::next(io.begin())); + assert( io.begin() < std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) > io.begin() ); + assert( io.begin() <= std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) >= io.begin() ); + assert( io.begin() <= io.begin() ); + assert( io.begin() >= io.begin() ); + } + { + std::ranges::iota_view io(0); + assert( io.begin() == io.begin() ); + assert( io.begin() != std::ranges::next(io.begin())); + assert( io.begin() < std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) > io.begin() ); + assert( io.begin() <= std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) >= io.begin() ); + assert( io.begin() <= io.begin() ); + assert( io.begin() >= io.begin() ); + } + { + const std::ranges::iota_view io(SomeInt(0)); + assert( io.begin() == io.begin() ); + assert( io.begin() != std::ranges::next(io.begin())); + assert( 
io.begin() < std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) > io.begin() ); + assert( io.begin() <= std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) >= io.begin() ); + assert( io.begin() <= io.begin() ); + assert( io.begin() >= io.begin() ); + } + { + std::ranges::iota_view io(SomeInt(0)); + assert( io.begin() == io.begin() ); + assert( io.begin() != std::ranges::next(io.begin())); + assert( io.begin() < std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) > io.begin() ); + assert( io.begin() <= std::ranges::next(io.begin())); + assert(std::ranges::next(io.begin()) >= io.begin() ); + assert( io.begin() <= io.begin() ); + assert( io.begin() >= io.begin() ); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.default.pass.cpp new file mode 100644 index 0000000000000..ac8075dba9143 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.default.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// iterator() requires default_initializable = default; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + using Iter = std::ranges::iterator_t>>; + Iter iter; + assert((*iter).value_ == 42); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.value.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.value.pass.cpp new file mode 100644 index 0000000000000..8395dde50df4f --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/ctor.value.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr explicit iterator(W value); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + { + using Iter = std::ranges::iterator_t>; + auto iter = Iter(42); + assert(*iter == 42); + } + { + using Iter = std::ranges::iterator_t>; + auto iter = Iter(SomeInt(42)); + assert(*iter == SomeInt(42)); + } + { + using Iter = std::ranges::iterator_t>; + static_assert(!std::is_convertible_v); + static_assert( std::is_constructible_v); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/decrement.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/decrement.pass.cpp new file mode 100644 index 0000000000000..a17dc3f99f472 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/decrement.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator& operator--() requires decrementable; +// constexpr iterator operator--(int) requires decrementable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +template +concept Decrementable = + requires(T i) { + --i; + } || + requires(T i) { + i--; + }; + +constexpr bool test() { + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin()); + auto iter2 = std::next(io.begin()); + assert(iter1 == iter2); + assert(--iter1 != iter2--); + assert(iter1 == iter2); + + static_assert(!std::is_reference_v); + static_assert( std::is_reference_v); + static_assert(std::same_as, decltype(iter2--)>); + } + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = std::next(io.begin()); + auto iter2 = std::next(io.begin()); + assert(iter1 == iter2); + assert(--iter1 != iter2--); + assert(iter1 == iter2); + + static_assert(!std::is_reference_v); + static_assert( std::is_reference_v); + static_assert(std::same_as, decltype(iter2--)>); + } + + static_assert(!Decrementable>); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/increment.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/increment.pass.cpp new file mode 100644 index 0000000000000..cbc143e20a0d6 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/increment.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator& operator++(); +// constexpr void operator++(int); +// constexpr iterator operator++(int) requires incrementable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(++iter1 != iter2++); + assert(iter1 == iter2); + + static_assert(!std::is_reference_v); + static_assert( std::is_reference_v); + static_assert(std::same_as, decltype(iter2++)>); + } + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(++iter1 != iter2++); + assert(iter1 == iter2); + + static_assert(!std::is_reference_v); + static_assert( std::is_reference_v); + static_assert(std::same_as, decltype(iter2++)>); + } + + { + std::ranges::iota_view io(NotIncrementable(0)); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(++iter1 != iter2); + iter2++; + assert(iter1 == iter2); + + static_assert(std::same_as); + static_assert(std::is_reference_v); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/member_typedefs.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/member_typedefs.compile.pass.cpp new file mode 100644 index 0000000000000..26d6bafbe5eed --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/member_typedefs.compile.pass.cpp @@ -0,0 +1,163 @@ +//===----------------------------------------------------------------------===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// Test iterator category and iterator concepts. + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +struct Decrementable { + using difference_type = int; + + auto operator<=>(const Decrementable&) const = default; + + constexpr Decrementable& operator++(); + constexpr Decrementable operator++(int); + constexpr Decrementable& operator--(); + constexpr Decrementable operator--(int); +}; + +struct Incrementable { + using difference_type = int; + + auto operator<=>(const Incrementable&) const = default; + + constexpr Incrementable& operator++(); + constexpr Incrementable operator++(int); +}; + +struct BigType { + char buffer[128]; + + using difference_type = int; + + auto operator<=>(const BigType&) const = default; + + constexpr BigType& operator++(); + constexpr BigType operator++(int); +}; + +struct CharDifferenceType { + using difference_type = signed char; + + auto operator<=>(const CharDifferenceType&) const = default; + + constexpr CharDifferenceType& operator++(); + constexpr CharDifferenceType operator++(int); +}; + +template +concept HasIteratorCategory = requires { typename std::ranges::iterator_t::iterator_category; }; + +void test() { + { + const std::ranges::iota_view io(0); + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(sizeof(Iter::difference_type) > sizeof(char)); + static_assert(std::is_signed_v); + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + const std::ranges::iota_view io(0); + using Iter = 
decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(sizeof(Iter::difference_type) > sizeof(short)); + static_assert(std::is_signed_v); + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + const std::ranges::iota_view io(0); + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(sizeof(Iter::difference_type) > sizeof(int)); + static_assert(std::is_signed_v); + // If we're compiling for 32 bit or windows, int and long are the same size, so long long is the correct difference type. +#if INTPTR_MAX == INT32_MAX || defined(_WIN32) + LIBCPP_STATIC_ASSERT(std::same_as); +#else + LIBCPP_STATIC_ASSERT(std::same_as); +#endif + } + { + const std::ranges::iota_view io(0); + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + // Same as below, if there is no type larger than long, we can just use that. 
+ static_assert(sizeof(Iter::difference_type) >= sizeof(long)); + static_assert(std::is_signed_v); + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + const std::ranges::iota_view io(0); + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + // No integer is larger than long long, so it is OK to use long long as the difference type here: + // https://eel.is/c++draft/range.iota.view#1.3 + static_assert(sizeof(Iter::difference_type) >= sizeof(long long)); + static_assert(std::is_signed_v); + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + const std::ranges::iota_view io; + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + } + { + const std::ranges::iota_view io; + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + } + { + const std::ranges::iota_view io(NotIncrementable(0)); + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(!HasIteratorCategory>); + static_assert(std::same_as); + static_assert(std::same_as); + } + { + const std::ranges::iota_view io; + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + } + { + const std::ranges::iota_view io; + using Iter = decltype(io.begin()); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + static_assert(std::same_as); + } +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus.pass.cpp new file mode 100644 index 0000000000000..f4181801a948f --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus.pass.cpp @@ -0,0 
+1,179 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr iterator operator-(iterator i, difference_type n) +// requires advanceable; +// friend constexpr difference_type operator-(const iterator& x, const iterator& y) +// requires advanceable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +// If we're compiling for 32 bit or windows, int and long are the same size, so long long is the correct difference type. +#if INTPTR_MAX == INT32_MAX || defined(_WIN32) +using IntDiffT = long long; +#else +using IntDiffT = long; +#endif + +constexpr bool test() { + // - difference_type + { + // When "_Start" is signed integer like. + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + assert(iter1 - 5 != iter2); + assert(iter1 - 5 == std::ranges::prev(iter2, 5)); + + static_assert(!std::is_reference_v); + } + + // When "_Start" is not integer like. + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + assert(iter1 - 5 != iter2); + assert(iter1 - 5 == std::ranges::prev(iter2, 5)); + + static_assert(!std::is_reference_v); + } + + // When "_Start" is unsigned integer like and n is greater than or equal to zero. 
+ { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + assert(iter1 - 5 != iter2); + assert(iter1 - 5 == std::ranges::prev(iter2, 5)); + + static_assert(!std::is_reference_v); + } + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - 0 == iter2); + } + + // When "_Start" is unsigned integer like and n is less than zero. + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - 5 != iter2); + assert(iter1 - 5 == std::ranges::prev(iter2, 5)); + + static_assert(!std::is_reference_v); + } + } + + // - + { + // When "_Start" is signed integer like. + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 5); + assert(iter1 - iter2 == 5); + + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - iter2 == 0); + + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 5); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - iter2 == -5); + + LIBCPP_STATIC_ASSERT(std::same_as); + } + + // When "_Start" is unsigned integer like and y > x. + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 5); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - iter2 == -5); + + LIBCPP_STATIC_ASSERT(std::same_as); + } + + // When "_Start" is unsigned integer like and x >= y. 
+ { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 5); + assert(iter1 - iter2 == 5); + + LIBCPP_STATIC_ASSERT(std::same_as); + } + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - iter2 == 0); + + LIBCPP_STATIC_ASSERT(std::same_as); + } + + // When "_Start" is not integer like. + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 5); + assert(iter1 - iter2 == 5); + + static_assert(std::same_as); + } + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - iter2 == 0); + + static_assert(std::same_as); + } + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = std::next(io.begin(), 5); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 - iter2 == -5); + + static_assert(std::same_as); + } + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus_eq.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus_eq.pass.cpp new file mode 100644 index 0000000000000..6616ebbb0a2ae --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/minus_eq.pass.cpp @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator& operator-=(difference_type n) +// requires advanceable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + // When "_Start" is signed integer like. + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + iter1 -= 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::prev(iter2, 5)); + + static_assert(std::is_reference_v); + } + + // When "_Start" is not integer like. + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + iter1 -= 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::prev(iter2, 5)); + + static_assert(std::is_reference_v); + } + + // When "_Start" is unsigned integer like and n is greater than or equal to zero. + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + iter1 -= 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::prev(iter2, 5)); + + static_assert(std::is_reference_v); + } + { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + iter1 -= 0; + assert(iter1 == iter2); + } + + // When "_Start" is unsigned integer like and n is less than zero. 
+ { + std::ranges::iota_view io(0); + auto iter1 = std::next(io.begin(), 10); + auto iter2 = std::next(io.begin(), 10); + assert(iter1 == iter2); + iter1 -= -5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus.pass.cpp new file mode 100644 index 0000000000000..b844900b3dd42 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus.pass.cpp @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr iterator operator+(iterator i, difference_type n) +// requires advanceable; +// friend constexpr iterator operator+(difference_type n, iterator i) +// requires advanceable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + // When "_Start" is signed integer like. + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(iter1 + 5 != iter2); + assert(iter1 + 5 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + // When "_Start" is not integer like. 
+ { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(iter1 + 5 != iter2); + assert(iter1 + 5 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + // When "_Start" is unsigned integer like and n is greater than or equal to zero. + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(iter1 + 5 != iter2); + assert(iter1 + 5 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(iter1 + 0 == iter2); + } + + // When "_Start" is unsigned integer like and n is less than zero. + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + assert(iter1 + 5 != iter2); + assert(iter1 + 5 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus_eq.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus_eq.pass.cpp new file mode 100644 index 0000000000000..db3e6bdbcf287 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/plus_eq.pass.cpp @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator& operator+=(difference_type n) +// requires advanceable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + // When "_Start" is signed integer like. + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + iter1 += 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + // When "_Start" is not integer like. + { + std::ranges::iota_view io(SomeInt(0)); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + iter1 += 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + // When "_Start" is unsigned integer like and n is greater than or equal to zero. + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + iter1 += 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + iter1 += 0; + assert(iter1 == iter2); + } + + // When "_Start" is unsigned integer like and n is less than zero. 
+ { + std::ranges::iota_view io(0); + auto iter1 = io.begin(); + auto iter2 = io.begin(); + assert(iter1 == iter2); + iter1 += 5; + assert(iter1 != iter2); + assert(iter1 == std::ranges::next(iter2, 5)); + + static_assert(std::is_reference_v); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/star.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/star.pass.cpp new file mode 100644 index 0000000000000..492d7333fa9a0 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/star.pass.cpp @@ -0,0 +1,106 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// ADDITIONAL_COMPILE_FLAGS: -Wno-sign-compare + +// constexpr W operator*() const noexcept(is_nothrow_copy_constructible_v); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +struct NotNoexceptCopy { + using difference_type = int; + + int value_; + constexpr explicit NotNoexceptCopy(int value = 0) : value_(value) {} + NotNoexceptCopy(const NotNoexceptCopy&) noexcept(false) = default; + + bool operator==(const NotNoexceptCopy&) const = default; + + friend constexpr NotNoexceptCopy& operator+=(NotNoexceptCopy &lhs, const NotNoexceptCopy& rhs) { + lhs.value_ += rhs.value_; return lhs; + } + friend constexpr NotNoexceptCopy& operator-=(NotNoexceptCopy &lhs, const NotNoexceptCopy& rhs) { + lhs.value_ -= rhs.value_; return lhs; + } + + friend constexpr NotNoexceptCopy 
operator+(NotNoexceptCopy lhs, NotNoexceptCopy rhs) { + return NotNoexceptCopy{lhs.value_ + rhs.value_}; + } + friend constexpr int operator-(NotNoexceptCopy lhs, NotNoexceptCopy rhs) { + return lhs.value_ - rhs.value_; + } + + constexpr NotNoexceptCopy& operator++() { ++value_; return *this; } + constexpr void operator++(int) { ++value_; } +}; + +template +constexpr void testType() { + { + std::ranges::iota_view io(T(0)); + auto iter = io.begin(); + for (int i = 0; i < 100; ++i, ++iter) + assert(*iter == T(i)); + + static_assert(noexcept(*iter) == !std::same_as); + } + { + std::ranges::iota_view io(T(10)); + auto iter = io.begin(); + for (int i = 10; i < 100; ++i, ++iter) + assert(*iter == T(i)); + } + { + const std::ranges::iota_view io(T(0)); + auto iter = io.begin(); + for (int i = 0; i < 100; ++i, ++iter) + assert(*iter == T(i)); + } + { + const std::ranges::iota_view io(T(10)); + auto iter = io.begin(); + for (int i = 10; i < 100; ++i, ++iter) + assert(*iter == T(i)); + } +} + +constexpr bool test() { + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + + // Tests a mix of signed unsigned types. + { + const std::ranges::iota_view io(0, 10); + auto iter = io.begin(); + for (int i = 0; i < 10; ++i, ++iter) + assert(*iter == i); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/subscript.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/subscript.pass.cpp new file mode 100644 index 0000000000000..0fe4ea7048ca7 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/iterator/subscript.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr W operator[](difference_type n) const +// requires advanceable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +template +constexpr void testType() { + { + std::ranges::iota_view io(T(0)); + auto iter = io.begin(); + for (int i = 0; i < 100; ++i) + assert(iter[i] == T(i)); + } + { + std::ranges::iota_view io(T(10)); + auto iter = io.begin(); + for (int i = 0; i < 100; ++i) + assert(iter[i] == T(i + 10)); + } + { + const std::ranges::iota_view io(T(0)); + auto iter = io.begin(); + for (int i = 0; i < 100; ++i) + assert(iter[i] == T(i)); + } + { + const std::ranges::iota_view io(T(10)); + auto iter = io.begin(); + for (int i = 0; i < 100; ++i) + assert(iter[i] == T(i + 10)); + } +} + +constexpr bool test() { + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + testType(); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/range_concept_conformance.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/range_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..b125f76de9e49 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/range_concept_conformance.compile.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// Test that iota_view conforms to range and view concepts. + +#include + +#include "types.h" + +struct Decrementable { + using difference_type = int; + + auto operator<=>(const Decrementable&) const = default; + + constexpr Decrementable& operator++(); + constexpr Decrementable operator++(int); + constexpr Decrementable& operator--(); + constexpr Decrementable operator--(int); +}; + +struct Incrementable { + using difference_type = int; + + auto operator<=>(const Incrementable&) const = default; + + constexpr Incrementable& operator++(); + constexpr Incrementable operator++(int); +}; + +static_assert(std::ranges::random_access_range>); +static_assert(std::ranges::random_access_range>); +static_assert(std::ranges::bidirectional_range>); +static_assert(std::ranges::forward_range>); +static_assert(std::ranges::input_range>); +static_assert(std::ranges::view>); diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.default.pass.cpp new file mode 100644 index 0000000000000..0adb29cb46154 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.default.pass.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// sentinel() = default; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + using Sent = std::ranges::sentinel_t, IntComparableWith>>>; + using Iter = std::ranges::iterator_t, IntComparableWith>>>; + assert(Sent() == Iter()); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.value.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.value.pass.cpp new file mode 100644 index 0000000000000..ebb273873e2bc --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/ctor.value.pass.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr explicit sentinel(Bound bound); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + { + using Sent = std::ranges::sentinel_t>>; + using Iter = std::ranges::iterator_t>>; + auto sent = Sent(IntSentinelWith(42)); + assert(sent == Iter(42)); + } + { + using Sent = std::ranges::sentinel_t>>; + using Iter = std::ranges::iterator_t>>; + auto sent = Sent(IntSentinelWith(SomeInt(42))); + assert(sent == Iter(SomeInt(42))); + } + { + using Sent = std::ranges::sentinel_t>>; + static_assert(!std::is_convertible_v>); + static_assert( std::is_constructible_v>); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/eq.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/eq.pass.cpp new file mode 100644 index 0000000000000..b4b9d01fabd1a --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/eq.pass.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr bool operator==(const iterator& x, const sentinel& y); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + { + const std::ranges::iota_view> io(0, IntComparableWith(10)); + auto iter = io.begin(); + auto sent = io.end(); + assert(iter != sent); + assert(iter + 10 == sent); + } + { + std::ranges::iota_view> io(0, IntComparableWith(10)); + auto iter = io.begin(); + auto sent = io.end(); + assert(iter != sent); + assert(iter + 10 == sent); + } + { + const std::ranges::iota_view io(SomeInt(0), IntComparableWith(SomeInt(10))); + auto iter = io.begin(); + auto sent = io.end(); + assert(iter != sent); + assert(iter + 10 == sent); + } + { + std::ranges::iota_view io(SomeInt(0), IntComparableWith(SomeInt(10))); + auto iter = io.begin(); + auto sent = io.end(); + assert(iter != sent); + assert(iter + 10 == sent); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/minus.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/minus.pass.cpp new file mode 100644 index 0000000000000..6fd02878ca655 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/sentinel/minus.pass.cpp @@ -0,0 +1,65 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr iter_difference_t operator-(const iterator& x, const sentinel& y) +// requires sized_­sentinel_­for; +// friend constexpr iter_difference_t operator-(const sentinel& x, const iterator& y) +// requires sized_­sentinel_­for; + +#include +#include + +#include "test_macros.h" +#include "test_iterators.h" +#include "../types.h" + +template +concept MinusInvocable = requires(std::ranges::iota_view> io) { + io.end() - io.begin(); +}; + +constexpr bool test() { + int buffer[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + + { + auto outIter = random_access_iterator(buffer); + std::ranges::iota_view, IntSentinelWith>> io( + outIter, IntSentinelWith>(std::ranges::next(outIter, 8))); + auto iter = io.begin(); + auto sent = io.end(); + assert(iter - sent == -8); + assert(sent - iter == 8); + } + { + auto outIter = random_access_iterator(buffer); + const std::ranges::iota_view, IntSentinelWith>> io( + outIter, IntSentinelWith>(std::ranges::next(outIter, 8))); + const auto iter = io.begin(); + const auto sent = io.end(); + assert(iter - sent == -8); + assert(sent - iter == 8); + } + + { + // The minus operator requires that "W" is an input_or_output_iterator. 
+ static_assert(!MinusInvocable); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp new file mode 100644 index 0000000000000..4491b0f60eabe --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/size.pass.cpp @@ -0,0 +1,101 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr auto size() const requires see below; + +#include +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool test() { + // Both are integer like and both are less than zero. + { + const std::ranges::iota_view io(-10, -5); + assert(io.size() == 5); + } + { + const std::ranges::iota_view io(-10, -10); + assert(io.size() == 0); + } + + // Both are integer like and "value_" is less than zero. + { + const std::ranges::iota_view io(-10, 10); + assert(io.size() == 20); + } + { +// TODO: this is invalid with the current implementation. We need to file an LWG issue to +// fix this. Essentially the issue is: An int's min and max are -2147483648 and 2147483647 +// which means the negated min cannot be represented as an integer; it needs to be cast to +// an unsigned type first. That seems to be what the +// to-unsigned-like(bound_) + to-unsigned-like(-value_)) +// part of https://eel.is/c++draft/range.iota#view-15 is doing, but I think it's doing it +// wrong. 
It should be to-unsigned-like(bound_) - to-unsigned-like(value_)) (cast to +// unsigned first). +// const std::ranges::iota_view io(std::numeric_limits::min(), std::numeric_limits::max()); +// assert(io.size() == (static_cast(std::numeric_limits::max()) * 2) + 1); + } + + // It is UB for "bound_" to be less than "value_" i.e.: iota_view io(10, -5). + + // Both are integer like and neither less than zero. + { + const std::ranges::iota_view io(10, 20); + assert(io.size() == 10); + } + { + const std::ranges::iota_view io(10, 10); + assert(io.size() == 0); + } + { + const std::ranges::iota_view io(0, 0); + assert(io.size() == 0); + } + { + const std::ranges::iota_view io(0, std::numeric_limits::max()); + assert(io.size() == std::numeric_limits::max()); + } + + // Neither are integer like. + { + const std::ranges::iota_view io(SomeInt(-20), SomeInt(-10)); + assert(io.size() == 10); + } + { + const std::ranges::iota_view io(SomeInt(-10), SomeInt(-10)); + assert(io.size() == 0); + } + { + const std::ranges::iota_view io(SomeInt(0), SomeInt(0)); + assert(io.size() == 0); + } + { + const std::ranges::iota_view io(SomeInt(10), SomeInt(20)); + assert(io.size() == 10); + } + { + const std::ranges::iota_view io(SomeInt(10), SomeInt(10)); + assert(io.size() == 0); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/type.compile.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/type.compile.pass.cpp new file mode 100644 index 0000000000000..73c4a349194a2 --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/type.compile.pass.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +#include + +// Test that we SFINAE away iota_view. + +template std::ranges::iota_view f(int); +template void f(...); + +void test() { + f(42); +} diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/types.h b/libcxx/test/std/ranges/range.factories/range.iota.view/types.h new file mode 100644 index 0000000000000..a6eb1b4d537aa --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/types.h @@ -0,0 +1,212 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_STD_RANGES_RANGE_FACTORIES_RANGE_IOTA_VIEW_TYPES_H +#define TEST_STD_RANGES_RANGE_FACTORIES_RANGE_IOTA_VIEW_TYPES_H + +#include "test_macros.h" + +struct SomeInt { + using difference_type = int; + + int value_; + constexpr explicit SomeInt(int value = 0) : value_(value) {} + + auto operator<=>(const SomeInt&) const = default; + + friend constexpr SomeInt& operator+=(SomeInt &lhs, const SomeInt& rhs) { + lhs.value_ += rhs.value_; return lhs; + } + friend constexpr SomeInt& operator-=(SomeInt &lhs, const SomeInt& rhs) { + lhs.value_ -= rhs.value_; return lhs; + } + + friend constexpr SomeInt& operator+=(SomeInt &lhs, difference_type rhs) { + lhs.value_ += rhs; return lhs; + } + friend constexpr SomeInt& operator-=(SomeInt &lhs, difference_type rhs) { + lhs.value_ -= rhs; return lhs; + } + + friend constexpr SomeInt operator+(SomeInt lhs, SomeInt rhs) { + return 
SomeInt{lhs.value_ + rhs.value_}; + } + friend constexpr int operator-(SomeInt lhs, SomeInt rhs) { + return lhs.value_ - rhs.value_; + } + + friend constexpr SomeInt operator+(SomeInt lhs, difference_type rhs) { + return SomeInt{lhs.value_ + rhs}; + } + friend constexpr int operator-(SomeInt lhs, difference_type rhs) { + return lhs.value_ - rhs; + } + + friend constexpr SomeInt operator+(difference_type lhs, SomeInt rhs) { + return SomeInt{lhs + rhs.value_}; + } + friend constexpr int operator-(difference_type lhs, SomeInt rhs) { + return lhs - rhs.value_; + } + + constexpr SomeInt& operator++() { ++value_; return *this; } + constexpr SomeInt operator++(int) { auto tmp = *this; ++value_; return tmp; } + constexpr SomeInt& operator--() { --value_; return *this; } + constexpr SomeInt operator--(int) { auto tmp = *this; --value_; return tmp; } +}; + +template +struct IntComparableWith { + using difference_type = std::iter_difference_t; + + T value_; + constexpr explicit IntComparableWith(T value = T()) : value_(value) {} + + friend constexpr bool operator==(IntComparableWith lhs, IntComparableWith rhs) { + return lhs.value_ == rhs.value_; + } + friend constexpr bool operator==(IntComparableWith lhs, T rhs) { + return lhs.value_ == rhs; + } + friend constexpr bool operator==(T lhs, IntComparableWith rhs) { + return lhs == rhs.value_; + } + + friend constexpr IntComparableWith operator+(IntComparableWith lhs, IntComparableWith rhs) { + return IntComparableWith{lhs.value_ + rhs.value_}; + } + friend constexpr difference_type operator-(IntComparableWith lhs, IntComparableWith rhs) { + return lhs.value_ - rhs.value_; + } + + constexpr IntComparableWith& operator++() { ++value_; return *this; } + constexpr IntComparableWith operator++(int) { auto tmp = *this; ++value_; return tmp; } + constexpr IntComparableWith operator--() { --value_; return *this; } +}; + +template +struct IntSentinelWith { + using difference_type = std::iter_difference_t; + + T value_; + constexpr 
explicit IntSentinelWith(T value = T()) : value_(value) {} + + friend constexpr bool operator==(IntSentinelWith lhs, IntSentinelWith rhs) { + return lhs.value_ == rhs.value_; + } + friend constexpr bool operator==(IntSentinelWith lhs, T rhs) { + return lhs.value_ == rhs; + } + friend constexpr bool operator==(T lhs, IntSentinelWith rhs) { + return lhs == rhs.value_; + } + + friend constexpr IntSentinelWith operator+(IntSentinelWith lhs, IntSentinelWith rhs) { + return IntSentinelWith{lhs.value_ + rhs.value_}; + } + friend constexpr difference_type operator-(IntSentinelWith lhs, IntSentinelWith rhs) { + return lhs.value_ - rhs.value_; + } + friend constexpr difference_type operator-(IntSentinelWith lhs, T rhs) { + return lhs.value_ - rhs; + } + friend constexpr difference_type operator-(T lhs, IntSentinelWith rhs) { + return lhs - rhs.value_; + } + + constexpr IntSentinelWith& operator++() { ++value_; return *this; } + constexpr IntSentinelWith operator++(int) { auto tmp = *this; ++value_; return tmp; } + constexpr IntSentinelWith operator--() { --value_; return *this; } +}; + +struct NotIncrementable { + using difference_type = int; + + int value_; + constexpr explicit NotIncrementable(int value = 0) : value_(value) {} + + bool operator==(const NotIncrementable&) const = default; + + friend constexpr NotIncrementable& operator+=(NotIncrementable &lhs, const NotIncrementable& rhs) { + lhs.value_ += rhs.value_; return lhs; + } + friend constexpr NotIncrementable& operator-=(NotIncrementable &lhs, const NotIncrementable& rhs) { + lhs.value_ -= rhs.value_; return lhs; + } + + friend constexpr NotIncrementable operator+(NotIncrementable lhs, NotIncrementable rhs) { + return NotIncrementable{lhs.value_ + rhs.value_}; + } + friend constexpr int operator-(NotIncrementable lhs, NotIncrementable rhs) { + return lhs.value_ - rhs.value_; + } + + constexpr NotIncrementable& operator++() { ++value_; return *this; } + constexpr void operator++(int) { ++value_; } + constexpr 
NotIncrementable& operator--() { --value_; return *this; } +}; +static_assert(!std::incrementable); + +struct NotDecrementable { + using difference_type = int; + + int value_; + constexpr explicit NotDecrementable(int value = 0) : value_(value) {} + + bool operator==(const NotDecrementable&) const = default; + + friend constexpr NotDecrementable& operator+=(NotDecrementable &lhs, const NotDecrementable& rhs) { + lhs.value_ += rhs.value_; return lhs; + } + friend constexpr NotDecrementable& operator-=(NotDecrementable &lhs, const NotDecrementable& rhs) { + lhs.value_ -= rhs.value_; return lhs; + } + + friend constexpr NotDecrementable operator+(NotDecrementable lhs, NotDecrementable rhs) { + return NotDecrementable{lhs.value_ + rhs.value_}; + } + friend constexpr int operator-(NotDecrementable lhs, NotDecrementable rhs) { + return lhs.value_ - rhs.value_; + } + + constexpr NotDecrementable& operator++() { ++value_; return *this; } + constexpr void operator++(int) { ++value_; } +}; + +enum CtorKind { DefaultTo42, ValueCtor }; + +template +struct Int42 { + using difference_type = int; + + int value_; + constexpr explicit Int42(int value) : value_(value) {} + constexpr explicit Int42() requires (CK == DefaultTo42) + : value_(42) {} + + bool operator==(const Int42&) const = default; + + friend constexpr Int42& operator+=(Int42 &lhs, const Int42& rhs) { + lhs.value_ += rhs.value_; return lhs; + } + friend constexpr Int42& operator-=(Int42 &lhs, const Int42& rhs) { + lhs.value_ -= rhs.value_; return lhs; + } + + friend constexpr Int42 operator+(Int42 lhs, Int42 rhs) { + return Int42{lhs.value_ + rhs.value_}; + } + friend constexpr int operator-(Int42 lhs, Int42 rhs) { + return lhs.value_ - rhs.value_; + } + + constexpr Int42& operator++() { ++value_; return *this; } + constexpr void operator++(int) { ++value_; } +}; + +#endif // TEST_STD_RANGES_RANGE_FACTORIES_RANGE_IOTA_VIEW_TYPES_H diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp 
b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp new file mode 100644 index 0000000000000..661285585095c --- /dev/null +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// views::iota + +#include +#include +#include + +#include "test_macros.h" +#include "types.h" + +template +constexpr void testType(U u) { + // Test that this generally does the right thing. + // Test with only one argument. + { + assert(*std::views::iota(T(0)).begin() == T(0)); + } + { + const auto io = std::views::iota(T(10)); + assert(*io.begin() == T(10)); + } + // Test with two arguments. + { + assert(*std::views::iota(T(0), u).begin() == T(0)); + } + { + const auto io = std::views::iota(T(10), u); + assert(*io.begin() == T(10)); + } + // Test that we return the correct type. + { + ASSERT_SAME_TYPE(decltype(std::views::iota(T(10))), std::ranges::iota_view); + ASSERT_SAME_TYPE(decltype(std::views::iota(T(10), u)), std::ranges::iota_view); + } + // Test that this is semiregular. + // Note: we cannot test perfect forwarding because both T and U must be copyable. 
+ { + static_assert(std::semiregular>); + } +} + +struct X {}; + +constexpr bool test() { + testType(SomeInt(10)); + testType(IntComparableWith(SomeInt(10))); + testType(IntComparableWith(10)); + testType(IntComparableWith(10)); + testType(IntComparableWith(10)); + testType(int(10)); + testType(unsigned(10)); + testType(IntComparableWith(10)); + testType(short(10)); + testType(IntComparableWith(10)); + testType(IntComparableWith(10)); + + { + static_assert( std::is_invocable_v); + static_assert(!std::is_invocable_v); + static_assert( std::is_invocable_v); + static_assert(!std::is_invocable_v); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} From df324bba5c4cc0309ef4bc756fab4ebb6d67dfba Mon Sep 17 00:00:00 2001 From: zoecarver Date: Fri, 6 Aug 2021 15:33:46 -0700 Subject: [PATCH 005/700] [libcxx][ranges] Add `ranges::join_view`. Differential Revision: https://reviews.llvm.org/D107671 --- libcxx/docs/Status/RangesPaper.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__ranges/join_view.h | 350 ++++++++++++++++++ .../include/__ranges/non_propagating_cache.h | 8 + libcxx/include/module.modulemap | 1 + libcxx/include/ranges | 6 + .../ranges/join_view.module.verify.cpp | 16 + .../range.join.view/base.pass.cpp | 62 ++++ .../range.join.view/begin.pass.cpp | 97 +++++ .../range.join.view/ctad.compile.pass.cpp | 72 ++++ .../range.join.view/ctad.verify.cpp | 33 ++ .../range.join.view/ctor.base.pass.cpp | 49 +++ .../range.join.view/ctor.default.pass.cpp | 37 ++ .../range.join.view/end.pass.cpp | 120 ++++++ .../range.join.view/general.pass.cpp | 51 +++ .../range.join.view/iterator/arrow.pass.cpp | 50 +++ .../iterator/ctor.default.pass.cpp | 56 +++ .../iterator/ctor.other.pass.cpp | 41 ++ .../iterator/ctor.parent.outer.pass.cpp | 38 ++ .../iterator/decrement.pass.cpp | 74 ++++ .../range.join.view/iterator/eq.pass.cpp | 41 ++ .../iterator/increment.pass.cpp | 160 ++++++++ .../iterator/iter.move.pass.cpp | 38 ++ 
.../iterator/iter.swap.pass.cpp | 43 +++ .../iterator/member_types.compile.pass.cpp | 67 ++++ .../range.join.view/iterator/star.pass.cpp | 55 +++ .../sentinel/ctor.default.pass.cpp | 33 ++ .../sentinel/ctor.other.pass.cpp | 41 ++ .../sentinel/ctor.parent.pass.cpp | 45 +++ .../range.join.view/sentinel/eq.pass.cpp | 52 +++ .../range.adaptors/range.join.view/types.h | 141 +++++++ libcxx/test/support/test_iterators.h | 13 + libcxx/test/support/test_range.h | 6 + 33 files changed, 1898 insertions(+), 1 deletion(-) create mode 100644 libcxx/include/__ranges/join_view.h create mode 100644 libcxx/test/libcxx/diagnostics/detail.headers/ranges/join_view.module.verify.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/base.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/begin.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.verify.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.base.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.default.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/end.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/general.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/arrow.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.default.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.other.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.parent.outer.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/decrement.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/eq.pass.cpp 
create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/increment.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.move.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.swap.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/member_types.compile.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/star.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.default.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.other.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.parent.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/eq.pass.cpp create mode 100644 libcxx/test/std/ranges/range.adaptors/range.join.view/types.h diff --git a/libcxx/docs/Status/RangesPaper.csv b/libcxx/docs/Status/RangesPaper.csv index 77239c314f787..fcdbe8fb61dd1 100644 --- a/libcxx/docs/Status/RangesPaper.csv +++ b/libcxx/docs/Status/RangesPaper.csv @@ -140,7 +140,7 @@ Section,Description,Dependencies,Assignee,Complete `[range.transform] `_,`transform_view `_,[range.all],Zoe Carver,✅ `[range.iota] `_,iota_view,[range.all],Zoe Carver,✅ `[range.take] `_,take_view,[range.all],Zoe Carver,✅ -`[range.join] `_,join_view,[range.all],Zoe Carver,In Progress +`[range.join] `_,join_view,[range.all],Zoe Carver,✅ `[range.empty] `_,`empty_view `_,[view.interface],Zoe Carver,✅ `[range.single] `_,single_view,[view.interface],Zoe Carver,✅ `[range.split] `_,split_view,[range.all],Zoe Carver,In Progress diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 914c74c7ceb60..f30580e5e8aa5 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -224,6 +224,7 @@ set(files 
__ranges/enable_borrowed_range.h __ranges/enable_view.h __ranges/iota_view.h + __ranges/join_view.h __ranges/non_propagating_cache.h __ranges/ref_view.h __ranges/reverse_view.h diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h new file mode 100644 index 0000000000000..44aa1d0264e6d --- /dev/null +++ b/libcxx/include/__ranges/join_view.h @@ -0,0 +1,350 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef _LIBCPP___RANGES_JOIN_VIEW_H +#define _LIBCPP___RANGES_JOIN_VIEW_H + +#include <__config> +#include <__iterator/concepts.h> +#include <__iterator/iterator_traits.h> +#include <__ranges/access.h> +#include <__ranges/all.h> +#include <__ranges/concepts.h> +#include <__ranges/non_propagating_cache.h> +#include <__ranges/ref_view.h> +#include <__ranges/subrange.h> +#include <__ranges/view_interface.h> +#include <__utility/declval.h> +#include <__utility/forward.h> +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +#pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if !defined(_LIBCPP_HAS_NO_RANGES) + +namespace ranges { + template + struct __join_view_iterator_category {}; + + template + requires is_reference_v> && + forward_range<_View> && + forward_range> + struct __join_view_iterator_category<_View> { + using _OuterC = typename iterator_traits>::iterator_category; + using _InnerC = typename iterator_traits>>::iterator_category; + + using iterator_category = _If< + derived_from<_OuterC, bidirectional_iterator_tag> && derived_from<_InnerC, bidirectional_iterator_tag>, + bidirectional_iterator_tag, + _If< + derived_from<_OuterC, 
forward_iterator_tag> && derived_from<_InnerC, forward_iterator_tag>, + forward_iterator_tag, + input_iterator_tag + > + >; + }; + + template + requires view<_View> && input_range> + class join_view + : public view_interface> { + private: + using _InnerRange = range_reference_t<_View>; + + template struct __iterator; + template struct __sentinel; + + static constexpr bool _UseCache = !is_reference_v<_InnerRange>; + using _Cache = _If<_UseCache, __non_propagating_cache>, __empty_cache>; + [[no_unique_address]] _Cache __cache_; + _View __base_ = _View(); // TODO: [[no_unique_address]] makes clang crash! File a bug :) + + public: + _LIBCPP_HIDE_FROM_ABI + join_view() requires default_initializable<_View> = default; + + _LIBCPP_HIDE_FROM_ABI + constexpr explicit join_view(_View __base) + : __base_(_VSTD::move(__base)) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr _View base() const& requires copy_constructible<_View> { return __base_; } + + _LIBCPP_HIDE_FROM_ABI + constexpr _View base() && { return _VSTD::move(__base_); } + + _LIBCPP_HIDE_FROM_ABI + constexpr auto begin() { + constexpr bool __use_const = __simple_view<_View> && + is_reference_v>; + return __iterator<__use_const>{*this, ranges::begin(__base_)}; + } + + template + _LIBCPP_HIDE_FROM_ABI + constexpr auto begin() const + requires input_range && + is_reference_v> + { + return __iterator{*this, ranges::begin(__base_)}; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr auto end() { + if constexpr (forward_range<_View> && + is_reference_v<_InnerRange> && + forward_range<_InnerRange> && + common_range<_View> && + common_range<_InnerRange>) + return __iterator<__simple_view<_View>>{*this, ranges::end(__base_)}; + else + return __sentinel<__simple_view<_View>>{*this}; + } + + template + _LIBCPP_HIDE_FROM_ABI + constexpr auto end() const + requires input_range && + is_reference_v> + { + using _ConstInnerRange = range_reference_t; + if constexpr (forward_range && + is_reference_v<_ConstInnerRange> && + 
forward_range<_ConstInnerRange> && + common_range && + common_range<_ConstInnerRange>) { + return __iterator{*this, ranges::end(__base_)}; + } else { + return __sentinel{*this}; + } + } + }; + + template + requires view<_View> && input_range> + template struct join_view<_View>::__sentinel { + template friend struct __sentinel; + + private: + using _Parent = __maybe_const<_Const, join_view>; + using _Base = __maybe_const<_Const, _View>; + sentinel_t<_Base> __end_ = sentinel_t<_Base>(); + + public: + _LIBCPP_HIDE_FROM_ABI + __sentinel() = default; + + _LIBCPP_HIDE_FROM_ABI + constexpr explicit __sentinel(_Parent& __parent) + : __end_(ranges::end(__parent.__base_)) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr __sentinel(__sentinel __s) + requires _Const && convertible_to, sentinel_t<_Base>> + : __end_(_VSTD::move(__s.__end_)) {} + + template + requires sentinel_for, iterator_t<__maybe_const<_OtherConst, _View>>> + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator==(const __iterator<_OtherConst>& __x, const __sentinel& __y) { + return __x.__outer_ == __y.__end_; + } + }; + + template + requires view<_View> && input_range> + template struct join_view<_View>::__iterator + : public __join_view_iterator_category<__maybe_const<_Const, _View>> { + + template friend struct __iterator; + + private: + using _Parent = __maybe_const<_Const, join_view>; + using _Base = __maybe_const<_Const, _View>; + using _Outer = iterator_t<_Base>; + using _Inner = iterator_t>; + + static constexpr bool __ref_is_glvalue = is_reference_v>; + + public: + _Outer __outer_ = _Outer(); + + private: + optional<_Inner> __inner_; + _Parent *__parent_ = nullptr; + + _LIBCPP_HIDE_FROM_ABI + constexpr void __satisfy() { + for (; __outer_ != ranges::end(__parent_->__base_); ++__outer_) { + auto&& __inner = [&]() -> auto&& { + if constexpr (__ref_is_glvalue) + return *__outer_; + else + return __parent_->__cache_.__emplace_deref(__outer_); + }(); + __inner_ = ranges::begin(__inner); + if (*__inner_ != 
ranges::end(__inner)) + return; + } + + if constexpr (__ref_is_glvalue) + __inner_.reset(); + } + + public: + using iterator_concept = _If< + __ref_is_glvalue && bidirectional_range<_Base> && bidirectional_range>, + bidirectional_iterator_tag, + _If< + __ref_is_glvalue && forward_range<_Base> && forward_range>, + forward_iterator_tag, + input_iterator_tag + > + >; + + using value_type = range_value_t>; + + using difference_type = common_type_t< + range_difference_t<_Base>, range_difference_t>>; + + _LIBCPP_HIDE_FROM_ABI + __iterator() requires default_initializable<_Outer> = default; + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator(_Parent& __parent, _Outer __outer) + : __outer_(_VSTD::move(__outer)) + , __parent_(_VSTD::addressof(__parent)) { + __satisfy(); + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator(__iterator __i) + requires _Const && + convertible_to, _Outer> && + convertible_to, _Inner> + : __outer_(_VSTD::move(__i.__outer_)) + , __inner_(_VSTD::move(__i.__inner_)) + , __parent_(__i.__parent_) {} + + _LIBCPP_HIDE_FROM_ABI + constexpr decltype(auto) operator*() const { + return **__inner_; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr _Inner operator->() const + requires __has_arrow<_Inner> && copyable<_Inner> + { + return *__inner_; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator& operator++() { + auto&& __inner = [&]() -> auto&& { + if constexpr (__ref_is_glvalue) + return *__outer_; + else + return *__parent_->__cache_; + }(); + if (++*__inner_ == ranges::end(__inner)) { + ++__outer_; + __satisfy(); + } + return *this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr void operator++(int) { + ++*this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator operator++(int) + requires __ref_is_glvalue && + forward_range<_Base> && + forward_range> + { + auto __tmp = *this; + ++*this; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator& operator--() + requires __ref_is_glvalue && + bidirectional_range<_Base> && + bidirectional_range> && + 
common_range> + { + if (__outer_ == ranges::end(__parent_->__base_)) + __inner_ = ranges::end(*--__outer_); + + // Skip empty inner ranges when going backwards. + while (*__inner_ == ranges::begin(*__outer_)) { + __inner_ = ranges::end(*--__outer_); + } + + --*__inner_; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI + constexpr __iterator operator--(int) + requires __ref_is_glvalue && + bidirectional_range<_Base> && + bidirectional_range> && + common_range> + { + auto __tmp = *this; + --*this; + return __tmp; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr bool operator==(const __iterator& __x, const __iterator& __y) + requires __ref_is_glvalue && + equality_comparable> && + equality_comparable>> + { + return __x.__outer_ == __y.__outer_ && __x.__inner_ == __y.__inner_; + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr decltype(auto) iter_move(const __iterator& __i) + noexcept(noexcept(ranges::iter_move(*__i.__inner_))) + { + return ranges::iter_move(*__i.__inner_); + } + + _LIBCPP_HIDE_FROM_ABI + friend constexpr void iter_swap(const __iterator& __x, const __iterator& __y) + noexcept(noexcept(ranges::iter_swap(*__x.__inner_, *__y.__inner_))) + requires indirectly_swappable<_Inner> + { + return ranges::iter_swap(*__x.__inner_, *__y.__inner_); + } + }; + + template + explicit join_view(_Range&&) -> join_view>; + +} // namespace ranges + +#undef _CONSTEXPR_TERNARY + +#endif // !defined(_LIBCPP_HAS_NO_RANGES) + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___RANGES_JOIN_VIEW_H diff --git a/libcxx/include/__ranges/non_propagating_cache.h b/libcxx/include/__ranges/non_propagating_cache.h index 878f7070a07f4..76577f47a5ad4 100644 --- a/libcxx/include/__ranges/non_propagating_cache.h +++ b/libcxx/include/__ranges/non_propagating_cache.h @@ -85,6 +85,14 @@ namespace ranges { constexpr void __set(_Tp const& __value) { __value_.emplace(__value); } _LIBCPP_HIDE_FROM_ABI constexpr void __set(_Tp&& __value) { __value_.emplace(_VSTD::move(__value)); } + + template + 
_LIBCPP_HIDE_FROM_ABI + constexpr _Tp& __emplace_deref(const _Other& __value) { + __value_.reset(); + __value_.emplace(*__value); + return *__value_; + } }; struct __empty_cache { }; diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 079dff201804b..f9955a3cd0c3b 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -659,6 +659,7 @@ module std [system] { module enable_borrowed_range { private header "__ranges/enable_borrowed_range.h" } module enable_view { private header "__ranges/enable_view.h" } module iota_view { private header "__ranges/iota_view.h" } + module join_view { private header "__ranges/join_view.h" } module non_propagating_cache { private header "__ranges/non_propagating_cache.h" } module ref_view { private header "__ranges/ref_view.h" } module reverse_view { private header "__ranges/reverse_view.h" } diff --git a/libcxx/include/ranges b/libcxx/include/ranges index df8d4194ffa14..014260aaee15b 100644 --- a/libcxx/include/ranges +++ b/libcxx/include/ranges @@ -184,6 +184,11 @@ namespace std::ranges { template inline constexpr bool enable_borrowed_range> = true; + + // [range.join], join view + template + requires view && input_range> + class join_view; } */ @@ -207,6 +212,7 @@ namespace std::ranges { #include <__ranges/enable_borrowed_range.h> #include <__ranges/enable_view.h> #include <__ranges/iota_view.h> +#include <__ranges/join_view.h> #include <__ranges/ref_view.h> #include <__ranges/reverse_view.h> #include <__ranges/take_view.h> diff --git a/libcxx/test/libcxx/diagnostics/detail.headers/ranges/join_view.module.verify.cpp b/libcxx/test/libcxx/diagnostics/detail.headers/ranges/join_view.module.verify.cpp new file mode 100644 index 0000000000000..ff48b744d5d93 --- /dev/null +++ b/libcxx/test/libcxx/diagnostics/detail.headers/ranges/join_view.module.verify.cpp @@ -0,0 +1,16 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// 
Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: modules-build + +// WARNING: This test was generated by 'generate_private_header_tests.py' +// and should not be edited manually. + +// expected-error@*:* {{use of private header from outside its module: '__ranges/join_view.h'}} +#include <__ranges/join_view.h> diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/base.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/base.pass.cpp new file mode 100644 index 0000000000000..60cbb26f2502d --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/base.pass.cpp @@ -0,0 +1,62 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr V base() const& requires copy_constructible; +// constexpr V base() &&; + +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool hasLValueQualifiedBase(auto&& view) { + return requires { view.base(); }; +} + +constexpr bool test() { + int buffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + + { + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(std::move(jv).base().ptr_ == children); + + static_assert(!hasLValueQualifiedBase(jv)); + ASSERT_SAME_TYPE(decltype(std::move(jv).base()), ParentView); + } + + { + std::ranges::join_view jv(buffer); + assert(jv.base().base() == buffer + 0); + + static_assert(hasLValueQualifiedBase(jv)); + ASSERT_SAME_TYPE(decltype(jv.base()), std::ranges::ref_view); + } + + { + const std::ranges::join_view jv(buffer); + assert(jv.base().base() == buffer + 0); + + static_assert(hasLValueQualifiedBase(jv)); + ASSERT_SAME_TYPE(decltype(jv.base()), std::ranges::ref_view); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/begin.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/begin.pass.cpp new file mode 100644 index 0000000000000..2441f6787f2ba --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/begin.pass.cpp @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr auto begin(); +// constexpr auto begin() const; + +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool test() { + int buffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + + { + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(*jv.begin() == 1111); + } + + { + CopyableChild children[4] = {CopyableChild(buffer[0], 4), CopyableChild(buffer[1], 0), CopyableChild(buffer[2], 1), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(*jv.begin() == 1111); + } + // Parent is empty. + { + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + std::ranges::join_view jv(ParentView(children, 0)); + assert(jv.begin() == jv.end()); + } + // Parent size is one. + { + CopyableChild children[1] = {CopyableChild(buffer[0])}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(*jv.begin() == 1111); + } + // Parent and child size is one. + { + CopyableChild children[1] = {CopyableChild(buffer[0], 1)}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(*jv.begin() == 1111); + } + // Parent size is one child is empty + { + CopyableChild children[1] = {CopyableChild(buffer[0], 0)}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(jv.begin() == jv.end()); + } + // Has all empty children. 
+ { + CopyableChild children[4] = {CopyableChild(buffer[0], 0), CopyableChild(buffer[1], 0), CopyableChild(buffer[2], 0), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(jv.begin() == jv.end()); + } + // First child is empty, others are not. + { + CopyableChild children[4] = {CopyableChild(buffer[0], 4), CopyableChild(buffer[1], 0), CopyableChild(buffer[2], 0), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(*jv.begin() == 1111); + } + // Last child is empty, others are not. + { + CopyableChild children[4] = {CopyableChild(buffer[0], 4), CopyableChild(buffer[1], 4), CopyableChild(buffer[2], 4), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(*jv.begin() == 1111); + } + + { + std::ranges::join_view jv(buffer); + assert(*jv.begin() == 1111); + } + + { + const std::ranges::join_view jv(buffer); + assert(*jv.begin() == 1111); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.compile.pass.cpp new file mode 100644 index 0000000000000..a81fa03e15c21 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.compile.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// explicit join_view(R&&) -> join_view>; + +#include + +#include "test_iterators.h" + +template +struct View : std::ranges::view_base { + // All friends here are defined to prevent GCC warnings. + friend T* begin(View&) { return nullptr; } + friend T* begin(View const&) { return nullptr; } + friend sentinel_wrapper end(View&) { return sentinel_wrapper(nullptr); } + friend sentinel_wrapper end(View const&) { return sentinel_wrapper(nullptr); } +}; + +template +struct Range { + friend T* begin(Range&) { return nullptr; } + friend T* begin(Range const&) { return nullptr; } + friend sentinel_wrapper end(Range&) { return sentinel_wrapper(nullptr); } + friend sentinel_wrapper end(Range const&) { return sentinel_wrapper(nullptr); } +}; + +template +struct BorrowedRange { + friend T* begin(BorrowedRange&) { return nullptr; } + friend T* begin(BorrowedRange const&) { return nullptr; } + friend sentinel_wrapper end(BorrowedRange&) { return sentinel_wrapper(nullptr); } + friend sentinel_wrapper end(BorrowedRange const&) { return sentinel_wrapper(nullptr); } +}; + +template<> +inline constexpr bool std::ranges::enable_borrowed_range>> = true; + +void testCTAD() { + View> v; + Range> r; + BorrowedRange> br; + + static_assert(std::same_as< + decltype(std::ranges::join_view(v)), + std::ranges::join_view>> + >); + static_assert(std::same_as< + decltype(std::ranges::join_view(r)), + std::ranges::join_view>>> + >); + // std::ranges::join_view(std::move(r)) invalid. RValue range must be borrowed. 
+ static_assert(std::same_as< + decltype(std::ranges::join_view(br)), + std::ranges::join_view>>> + >); + static_assert(std::same_as< + decltype(std::ranges::join_view(std::move(br))), + std::ranges::join_view *, + sentinel_wrapper *>, + std::ranges::subrange_kind::unsized>> + >); +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.verify.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.verify.cpp new file mode 100644 index 0000000000000..1bdd1e62eeb64 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctad.verify.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// explicit join_view(R&&) -> join_view>; + +// Tests that the deduction guide is explicit. 
+ +#include + +#include "test_iterators.h" + +template +struct Range { + friend T* begin(Range&) { return nullptr; } + friend T* begin(Range const&) { return nullptr; } + friend sentinel_wrapper end(Range&) { return sentinel_wrapper(nullptr); } + friend sentinel_wrapper end(Range const&) { return sentinel_wrapper(nullptr); } +}; + +void testExplicitCTAD() { + Range> r; + std::ranges::join_view v = r; // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'join_view'}} +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.base.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.base.pass.cpp new file mode 100644 index 0000000000000..2cdbe3b0268c4 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.base.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr explicit join_view(V base); + +#include +#include + +#include "test_macros.h" +#include "types.h" + +constexpr bool test() { + int buffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + + { + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView{children}); + assert(std::move(jv).base().ptr_ == children); + } + + { + std::ranges::join_view jv(buffer); + assert(jv.base().base() == buffer + 0); + } + + { + // Test explicitness. 
+ static_assert( std::is_constructible_v>, ParentView>); + static_assert(!std::is_convertible_v>, ParentView>); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.default.pass.cpp new file mode 100644 index 0000000000000..ff93d8aa6fdf8 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/ctor.default.pass.cpp @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// join_view() requires default_initializable = default; + +#include +#include + +#include "test_macros.h" +#include "types.h" + + +constexpr bool test() { + std::ranges::join_view> jv; + assert(std::move(jv).base().ptr_ == globalChildren); + + static_assert( std::default_initializable>>); + static_assert(!std::default_initializable>); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/end.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/end.pass.cpp new file mode 100644 index 0000000000000..33ef7a7374d3e --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/end.pass.cpp @@ -0,0 +1,120 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr auto end(); +// constexpr auto end() const; + +#include +#include +#include + +#include "test_macros.h" +#include "types.h" + + +constexpr bool test() { + int buffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + + // Non const common, forward range. + { + std::ranges::join_view jv(buffer); + assert(jv.end() == std::ranges::next(jv.begin(), 16)); + + static_assert(std::same_as); + } + + // Non const not common, input range. + { + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 16)); + + static_assert(!std::same_as); + } + + // Const common, forward range. + { + const std::ranges::join_view jv(buffer); + assert(jv.end() == std::ranges::next(jv.begin(), 16)); + + static_assert(std::same_as); + } + + // Const not common, input range. + { + static_assert(std::is_reference_v>); + + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + const auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 16)); + + static_assert(!std::same_as); + } + + // Has some empty children. + { + CopyableChild children[4] = {CopyableChild(buffer[0], 4), CopyableChild(buffer[1], 0), CopyableChild(buffer[2], 1), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 5)); + } + // Parent is empty. 
+ { + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + std::ranges::join_view jv(ParentView(children, 0)); + assert(jv.end() == jv.begin()); + } + // Parent size is one. + { + CopyableChild children[1] = {CopyableChild(buffer[0])}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(jv.end() == std::ranges::next(jv.begin(), 4)); + } + // Parent and child size is one. + { + CopyableChild children[1] = {CopyableChild(buffer[0], 1)}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(jv.end() == std::ranges::next(jv.begin())); + } + // Parent size is one child is empty + { + CopyableChild children[1] = {CopyableChild(buffer[0], 0)}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(jv.end() == jv.begin()); + } + // Has all empty children. + { + CopyableChild children[4] = {CopyableChild(buffer[0], 0), CopyableChild(buffer[1], 0), CopyableChild(buffer[2], 0), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == jv.begin()); + } + // First child is empty, others are not. + { + CopyableChild children[4] = {CopyableChild(buffer[0], 4), CopyableChild(buffer[1], 0), CopyableChild(buffer[2], 0), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 4)); + } + // Last child is empty, others are not. 
+ { + CopyableChild children[4] = {CopyableChild(buffer[0], 4), CopyableChild(buffer[1], 4), CopyableChild(buffer[2], 4), CopyableChild(buffer[3], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 12)); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/general.pass.cpp new file mode 100644 index 0000000000000..e0fb8a8c6ddc1 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/general.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// General tests for join_view. This file does not test anything specifically. 
+ +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "types.h" + + +template +bool isEqual(R &r, I i) { + for (auto e : r) + if (e != *i++) + return false; + return true; +} + +int main(int, char**) { + { + int buffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + int *flattened = reinterpret_cast(buffer); + + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(isEqual(jv, flattened)); + } + + { + std::vector vec = {"Hello", ",", " ", "World", "!"}; + std::string check = "Hello, World!"; + std::ranges::join_view jv(vec); + assert(isEqual(jv, check.begin())); + } + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/arrow.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/arrow.pass.cpp new file mode 100644 index 0000000000000..1579f56151645 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/arrow.pass.cpp @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr InnerIter operator->() const +// requires has-arrow && copyable; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + Box buffer[4][4] = {{{1111}, {2222}, {3333}, {4444}}, {{555}, {666}, {777}, {888}}, {{99}, {1010}, {1111}, {1212}}, {{13}, {14}, {15}, {16}}}; + + { + // Copyable input iterator with arrow. + ValueView children[4] = {ValueView(buffer[0]), ValueView(buffer[1]), ValueView(buffer[2]), ValueView(buffer[3])}; + std::ranges::join_view jv(ValueView>{children}); + assert(jv.begin()->x == 1111); + } + + { + std::ranges::join_view jv(buffer); + assert(jv.begin()->x == 1111); + } + + { + const std::ranges::join_view jv(buffer); + assert(jv.begin()->x == 1111); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.default.pass.cpp new file mode 100644 index 0000000000000..52bae5bb752fb --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.default.pass.cpp @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// iterator() requires default_initializable = default; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +template +struct DefaultCtorParent : std::ranges::view_base { + T *ptr_; + constexpr DefaultCtorParent(T *ptr) : ptr_(ptr) {} + + constexpr cpp17_input_iterator begin() { return cpp17_input_iterator(ptr_); } + constexpr cpp17_input_iterator begin() const { return cpp17_input_iterator(ptr_); } + constexpr T *end() { return ptr_ + 4; } + constexpr const T *end() const { return ptr_ + 4; } +}; + +template +constexpr bool operator==(const cpp17_input_iterator &lhs, const T *rhs) { return lhs.base() == rhs; } +template +constexpr bool operator==(const T *lhs, const cpp17_input_iterator &rhs) { return rhs.base() == lhs; } + +constexpr bool test() { + using Base = DefaultCtorParent; + // Note, only the outer iterator is default_initializable: + static_assert( std::default_initializable>); + static_assert(!std::default_initializable>>); + + std::ranges::iterator_t> iter1; + (void) iter1; + + static_assert(!std::default_initializable>>>); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.other.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.other.pass.cpp new file mode 100644 index 0000000000000..87290c4baec86 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.other.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator(iterator i); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + std::ranges::join_view jv(CopyableParent{children}); + auto iter1 = jv.begin(); + std::ranges::iterator_t iter2 = iter1; + assert(iter1 == iter2); + + // We cannot create a non-const iterator from a const iterator. + static_assert(!std::constructible_from); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.parent.outer.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.parent.outer.pass.cpp new file mode 100644 index 0000000000000..ae6ca21e72a94 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/ctor.parent.outer.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator(Parent& parent, OuterIter outer); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + CopyableParent parent{children}; + std::ranges::join_view jv(parent); + std::ranges::iterator_t iter(jv, std::ranges::begin(parent)); + assert(*iter == 1); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/decrement.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/decrement.pass.cpp new file mode 100644 index 0000000000000..66b3749365461 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/decrement.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator& operator--(); +// constexpr iterator operator--(int); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + { + // outer == ranges::end + std::ranges::join_view jv(buffer); + auto iter = std::next(jv.begin(), 16); + for (int i = 16; i != 0; --i) { + assert(*--iter == i); + } + } + { + // outer == ranges::end + std::ranges::join_view jv(buffer); + auto iter = std::next(jv.begin(), 13); + for (int i = 13; i != 0; --i) { + assert(*--iter == i); + } + } + { + // outer != ranges::end + std::ranges::join_view jv(buffer); + auto iter = std::next(jv.begin(), 12); + for (int i = 12; i != 0; --i) { + assert(*--iter == i); + } + } + { + // outer != ranges::end + std::ranges::join_view jv(buffer); + auto iter = std::next(jv.begin()); + for (int i = 1; i != 0; --i) { + assert(*--iter == i); + } + } + { + int small[2][1] = {{1}, {2}}; + std::ranges::join_view jv(small); + auto iter = std::next(jv.begin(), 2); + for (int i = 2; i != 0; --i) { + assert(*--iter == i); + } + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/eq.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/eq.pass.cpp new file mode 100644 index 0000000000000..b76f72a453cc5 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/eq.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr bool operator==(const iterator& x, const iterator& y); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + std::ranges::join_view jv(buffer); + auto iter1 = jv.begin(); + auto iter2 = jv.begin(); + assert(iter1 == iter2); + iter1++; + assert(iter1 != iter2); + iter2++; + assert(iter1 == iter2); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/increment.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/increment.pass.cpp new file mode 100644 index 0000000000000..853ed1a27f577 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/increment.pass.cpp @@ -0,0 +1,160 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr iterator& operator++(); +// constexpr void operator++(int); +// constexpr iterator operator++(int); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + // This way if we read past end we'll catch the error. + int buffer1[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}}; + int dummy = 42; + (void) dummy; + int buffer2[2][4] = {{9, 10, 11, 12}, {13, 14, 15, 16}}; + + // operator++(int); + { + std::ranges::join_view jv(buffer1); + auto iter = jv.begin(); + for (int i = 1; i < 9; ++i) { + assert(*iter++ == i); + } + } + { + ValueView children[4] = {ValueView(buffer1[0]), ValueView(buffer1[1]), ValueView(buffer2[0]), ValueView(buffer2[1])}; + std::ranges::join_view jv(ValueView>{children}); + auto iter = jv.begin(); + for (int i = 1; i < 17; ++i) { + assert(*iter == i); + iter++; + } + + ASSERT_SAME_TYPE(decltype(iter++), void); + } + { + std::ranges::join_view jv(buffer1); + auto iter = std::next(jv.begin(), 7); + assert(*iter++ == 8); + assert(iter == jv.end()); + } + { + int small[2][1] = {{1}, {2}}; + std::ranges::join_view jv(small); + auto iter = jv.begin(); + for (int i = 1; i < 3; ++i) { + assert(*iter++ == i); + } + } + // Has some empty children. + { + CopyableChild children[4] = {CopyableChild(buffer1[0], 4), CopyableChild(buffer1[1], 0), CopyableChild(buffer2[0], 1), CopyableChild(buffer2[1], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + auto iter = jv.begin(); + assert(*iter == 1); iter++; + assert(*iter == 2); iter++; + assert(*iter == 3); iter++; + assert(*iter == 4); iter++; + assert(*iter == 9); iter++; + assert(iter == jv.end()); + } + // Parent is empty. 
+ { + CopyableChild children[4] = {CopyableChild(buffer1[0]), CopyableChild(buffer1[1]), CopyableChild(buffer2[0]), CopyableChild(buffer2[1])}; + std::ranges::join_view jv(ParentView(children, 0)); + assert(jv.begin() == jv.end()); + } + // Parent size is one. + { + CopyableChild children[1] = {CopyableChild(buffer1[0])}; + std::ranges::join_view jv(ParentView(children, 1)); + auto iter = jv.begin(); + assert(*iter == 1); iter++; + assert(*iter == 2); iter++; + assert(*iter == 3); iter++; + assert(*iter == 4); iter++; + assert(iter == jv.end()); + } + // Parent and child size is one. + { + CopyableChild children[1] = {CopyableChild(buffer1[0], 1)}; + std::ranges::join_view jv(ParentView(children, 1)); + auto iter = jv.begin(); + assert(*iter == 1); iter++; + assert(iter == jv.end()); + } + // Parent size is one child is empty + { + CopyableChild children[1] = {CopyableChild(buffer1[0], 0)}; + std::ranges::join_view jv(ParentView(children, 1)); + assert(jv.begin() == jv.end()); + } + // Has all empty children. + { + CopyableChild children[4] = {CopyableChild(buffer1[0], 0), CopyableChild(buffer1[1], 0), CopyableChild(buffer2[0], 0), CopyableChild(buffer2[1], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.begin() == jv.end()); + } + // First child is empty, others are not. + { + CopyableChild children[4] = {CopyableChild(buffer1[0], 4), CopyableChild(buffer1[1], 0), CopyableChild(buffer2[0], 0), CopyableChild(buffer2[1], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + auto iter = jv.begin(); + assert(*iter == 1); iter++; + assert(*iter == 2); iter++; + assert(*iter == 3); iter++; + assert(*iter == 4); iter++; + assert(iter == jv.end()); + } + // Last child is empty, others are not. 
+ { + CopyableChild children[4] = {CopyableChild(buffer1[0], 4), CopyableChild(buffer1[1], 4), CopyableChild(buffer2[0], 4), CopyableChild(buffer2[1], 0)}; + auto jv = std::ranges::join_view(ParentView(children)); + auto iter = jv.begin(); + for (int i = 1; i < 13; ++i) { + assert(*iter == i); + iter++; + } + } + // operator++(); + { + std::ranges::join_view jv(buffer1); + auto iter = jv.begin(); + for (int i = 2; i < 9; ++i) { + assert(*++iter == i); + } + } + { + ValueView children[4] = {ValueView(buffer1[0]), ValueView(buffer1[1]), ValueView(buffer2[0]), ValueView(buffer2[1])}; + std::ranges::join_view jv(ValueView>{children}); + auto iter = jv.begin(); + for (int i = 2; i < 17; ++i) { + assert(*++iter == i); + } + + ASSERT_SAME_TYPE(decltype(++iter), decltype(iter)&); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.move.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.move.pass.cpp new file mode 100644 index 0000000000000..b3e3fd2dade71 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.move.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr decltype(auto) iter_move(const iterator& i); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + std::ranges::join_view jv(buffer); + assert(std::ranges::iter_move(jv.begin()) == 1); + ASSERT_SAME_TYPE(decltype(std::ranges::iter_move(jv.begin())), int&&); + + static_assert(noexcept(std::ranges::iter_move(std::declval()))); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.swap.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.swap.pass.cpp new file mode 100644 index 0000000000000..30d61f516df89 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/iter.swap.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// friend constexpr void iter_swap(const iterator& x, const iterator& y); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + std::ranges::join_view jv(buffer); + auto iter1 = jv.begin(); + auto iter2 = std::next(jv.begin()); + assert(*iter1 == 1); + assert(*iter2 == 2); + std::ranges::swap(iter1, iter2); + assert(*iter1 == 2); + assert(*iter2 == 1); + + static_assert(noexcept(std::ranges::iter_swap(iter1, iter2))); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/member_types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/member_types.compile.pass.cpp new file mode 100644 index 0000000000000..acf7ca17cd69b --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/member_types.compile.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// Iterator traits and member typedefs in join_view::. 
+ +#include + +#include "test_iterators.h" +#include "test_macros.h" +#include "../types.h" + +template +struct ForwardView : std::ranges::view_base { + friend forward_iterator begin(ForwardView&) { return forward_iterator(nullptr); } + friend forward_iterator begin(ForwardView const&) { return forward_iterator(nullptr); } + friend forward_iterator end(ForwardView&) { return forward_iterator(nullptr); } + friend forward_iterator end(ForwardView const&) { return forward_iterator(nullptr); } +}; + +template +struct InputView : std::ranges::view_base { + friend cpp17_input_iterator begin(InputView&) { return cpp17_input_iterator(nullptr); } + friend cpp17_input_iterator begin(InputView const&) { return cpp17_input_iterator(nullptr); } + friend cpp17_input_iterator end(InputView&) { return cpp17_input_iterator(nullptr); } + friend cpp17_input_iterator end(InputView const&) { return cpp17_input_iterator(nullptr); } +}; + +template +concept HasIterCategory = requires { typename T::iterator_category; }; + +void test() { + { + int buffer[4][4]; + std::ranges::join_view jv(buffer); + using Iter = std::ranges::iterator_t; + + ASSERT_SAME_TYPE(Iter::iterator_concept, std::bidirectional_iterator_tag); + ASSERT_SAME_TYPE(Iter::iterator_category, std::bidirectional_iterator_tag); + ASSERT_SAME_TYPE(Iter::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(Iter::value_type, int); + } + { + using Iter = std::ranges::iterator_t>>>; + + ASSERT_SAME_TYPE(Iter::iterator_concept, std::forward_iterator_tag); + ASSERT_SAME_TYPE(Iter::iterator_category, std::forward_iterator_tag); + ASSERT_SAME_TYPE(Iter::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(Iter::value_type, int); + } + { + using Iter = std::ranges::iterator_t>>>; + + ASSERT_SAME_TYPE(Iter::iterator_concept, std::input_iterator_tag); + static_assert(!HasIterCategory); + ASSERT_SAME_TYPE(Iter::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(Iter::value_type, int); + } +} diff --git 
a/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/star.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/star.pass.cpp new file mode 100644 index 0000000000000..542c3309d59b6 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/iterator/star.pass.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr decltype(auto) operator*() const; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + { + std::ranges::join_view jv(buffer); + auto iter = jv.begin(); + for (int i = 1; i < 17; ++i) { + assert(*iter++ == i); + } + } + { + std::ranges::join_view jv(buffer); + auto iter = std::next(jv.begin(), 15); + assert(*iter++ == 16); + assert(iter == jv.end()); + } + { + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView(children)); + auto iter = jv.begin(); + for (int i = 1; i < 17; ++i) { + assert(*iter == i); + ++iter; + } + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.default.pass.cpp new file mode 100644 index 0000000000000..74ab5c9af5a59 --- /dev/null +++ 
b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.default.pass.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// sentinel() = default; + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + std::ranges::sentinel_t> sent; + (void) sent; + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.other.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.other.pass.cpp new file mode 100644 index 0000000000000..fae2edd53fb87 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.other.pass.cpp @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr sentinel(sentinel s); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + std::ranges::join_view jv(CopyableParent{children}); + auto sent1 = jv.end(); + std::ranges::sentinel_t sent2 = sent1; + (void) sent2; // We can't really do anything with these sentinels now :/ + + // We cannot create a non-const iterator from a const iterator. + static_assert(!std::constructible_from); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.parent.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.parent.pass.cpp new file mode 100644 index 0000000000000..fc813dbe563de --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/ctor.parent.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// constexpr explicit sentinel(Parent& parent); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + CopyableParent parent{children}; + std::ranges::join_view jv(parent); + std::ranges::sentinel_t sent(jv); + assert(sent == std::ranges::next(jv.begin(), 16)); + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + { + // Test explicitness. + using Parent = std::ranges::join_view>; + static_assert( std::is_constructible_v, Parent&>); + static_assert(!std::is_convertible_v, Parent&>); + } + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/eq.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/eq.pass.cpp new file mode 100644 index 0000000000000..b33d13ff2df1e --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/sentinel/eq.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// requires sentinel_for, iterator_t>> +// friend constexpr bool operator==(const iterator& x, const sentinel& y); + +#include +#include + +#include "test_macros.h" +#include "../types.h" + +constexpr bool test() { + int buffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + + { + ChildView children[4] = {ChildView(buffer[0]), ChildView(buffer[1]), ChildView(buffer[2]), ChildView(buffer[3])}; + auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 16)); + } + { + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + const auto jv = std::ranges::join_view(ParentView(children)); + assert(jv.end() == std::ranges::next(jv.begin(), 16)); + } + { + CopyableChild children[4] = {CopyableChild(buffer[0]), CopyableChild(buffer[1]), CopyableChild(buffer[2]), CopyableChild(buffer[3])}; + const std::ranges::join_view jvc(CopyableParent{children}); + std::ranges::join_view jv(CopyableParent{children}); + assert(jvc.end() == std::ranges::next(jv.begin(), 16)); + assert( jv.end() == std::ranges::next(jvc.begin(), 16)); + } + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/ranges/range.adaptors/range.join.view/types.h b/libcxx/test/std/ranges/range.adaptors/range.join.view/types.h new file mode 100644 index 0000000000000..57f79a57485b3 --- /dev/null +++ b/libcxx/test/std/ranges/range.adaptors/range.join.view/types.h @@ -0,0 +1,141 @@ +//===----------------------------------------------------------------------===// +// +// Part of 
the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_JOIN_TYPES_H +#define TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_JOIN_TYPES_H + +#include + +#include "test_macros.h" +#include "test_iterators.h" + +int globalBuffer[4][4] = {{1111, 2222, 3333, 4444}, {555, 666, 777, 888}, {99, 1010, 1111, 1212}, {13, 14, 15, 16}}; + +struct ChildView : std::ranges::view_base { + int *ptr_; + + constexpr ChildView(int *ptr = globalBuffer[0]) : ptr_(ptr) {} + ChildView(const ChildView&) = delete; + ChildView(ChildView&&) = default; + ChildView& operator=(const ChildView&) = delete; + ChildView& operator=(ChildView&&) = default; + + constexpr cpp20_input_iterator begin() { return cpp20_input_iterator(ptr_); } + constexpr cpp20_input_iterator begin() const { return cpp20_input_iterator(ptr_); } + constexpr int *end() { return ptr_ + 4; } + constexpr const int *end() const { return ptr_ + 4; } +}; + +constexpr bool operator==(const cpp20_input_iterator &lhs, int* rhs) { return lhs.base() == rhs; } +constexpr bool operator==(int* lhs, const cpp20_input_iterator &rhs) { return rhs.base() == lhs; } + +ChildView globalChildren[4] = {ChildView(globalBuffer[0]), ChildView(globalBuffer[1]), ChildView(globalBuffer[2]), ChildView(globalBuffer[3])}; + +template +struct ParentView : std::ranges::view_base { + T *ptr_; + unsigned size_; + + constexpr ParentView(T *ptr, unsigned size = 4) + : ptr_(ptr), size_(size) {} + constexpr ParentView(ChildView *ptr = globalChildren, unsigned size = 4) + requires std::same_as + : ptr_(ptr), size_(size) {} + ParentView(const ParentView&) = delete; + ParentView(ParentView&&) = default; + ParentView& operator=(const ParentView&) = delete; + ParentView& operator=(ParentView&&) = default; + + 
constexpr cpp20_input_iterator begin() { return cpp20_input_iterator(ptr_); } + constexpr cpp20_input_iterator begin() const { return cpp20_input_iterator(ptr_); } + constexpr T *end() { return ptr_ + size_; } + constexpr const T *end() const { return ptr_ + size_; } +}; + +template +constexpr bool operator==(const cpp20_input_iterator &lhs, T *rhs) { return lhs.base() == rhs; } +template +constexpr bool operator==(T *lhs, const cpp20_input_iterator &rhs) { return rhs.base() == lhs; } + +struct CopyableChild : std::ranges::view_base { + int *ptr_; + unsigned size_; + constexpr CopyableChild(int *ptr = globalBuffer[0], unsigned size = 4) + : ptr_(ptr), size_(size) {} + + constexpr cpp17_input_iterator begin() { return cpp17_input_iterator(ptr_); } + constexpr cpp17_input_iterator begin() const { return cpp17_input_iterator(ptr_); } + constexpr int *end() { return ptr_ + size_; } + constexpr const int *end() const { return ptr_ + size_; } +}; + +constexpr bool operator==(const cpp17_input_iterator &lhs, const int* rhs) { return lhs.base() == rhs; } +constexpr bool operator==(const int* lhs, const cpp17_input_iterator &rhs) { return rhs.base() == lhs; } + +struct CopyableParent : std::ranges::view_base { + CopyableChild *ptr_; + constexpr CopyableParent(CopyableChild *ptr) : ptr_(ptr) {} + + constexpr cpp17_input_iterator begin() { return cpp17_input_iterator(ptr_); } + constexpr cpp17_input_iterator begin() const { return cpp17_input_iterator(ptr_); } + constexpr CopyableChild *end() { return ptr_ + 4; } + constexpr const CopyableChild *end() const { return ptr_ + 4; } +}; + +constexpr bool operator==(const cpp17_input_iterator &lhs, const CopyableChild *rhs) { return lhs.base() == rhs; } +constexpr bool operator==(const CopyableChild *lhs, const cpp17_input_iterator &rhs) { return rhs.base() == lhs; } + +struct Box { int x; }; + +template +struct InputValueIter { + typedef std::input_iterator_tag iterator_category; + typedef T value_type; + typedef int 
difference_type; + typedef T reference; + + T *ptr_; + constexpr InputValueIter(T *ptr) : ptr_(ptr) {} + + constexpr T operator*() const { return std::move(*ptr_); } + constexpr void operator++(int) { ++ptr_; } + constexpr InputValueIter& operator++() { ++ptr_; return *this; } + + constexpr T *operator->() { return ptr_; } +}; + +template +constexpr bool operator==(const InputValueIter &lhs, const T* rhs) { return lhs.ptr_ == rhs; } +template +constexpr bool operator==(const T* lhs, const InputValueIter &rhs) { return rhs.ptr_ == lhs; } + +template +struct ValueView : std::ranges::view_base { + InputValueIter ptr_; + + constexpr ValueView(T *ptr) : ptr_(ptr) {} + + constexpr ValueView(ValueView &&other) + : ptr_(other.ptr_) { other.ptr_.ptr_ = nullptr; } + + constexpr ValueView& operator=(ValueView &&other) { + ptr_ = other.ptr_; + other.ptr_ = InputValueIter(nullptr); + return *this; + } + + ValueView(const ValueView&) = delete; + ValueView& operator=(const ValueView&) = delete; + + constexpr InputValueIter begin() { return ptr_; } + constexpr const InputValueIter begin() const { return ptr_; } + constexpr T *end() { return ptr_.ptr_ + 4; } + constexpr const T *end() const { return ptr_.ptr_ + 4; } +}; + +#endif // TEST_STD_RANGES_RANGE_ADAPTORS_RANGE_JOIN_TYPES_H diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index 12cdd36632c12..2381fb8607b40 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -914,6 +914,13 @@ class stride_counting_iterator { difference_type stride_displacement_ = 0; }; +template +concept sentinel_for_base = requires(U const& u) { + u.base(); + requires std::input_or_output_iterator>; + requires std::equality_comparable_with; +}; + template class sentinel_wrapper { public: @@ -927,6 +934,12 @@ class sentinel_wrapper { constexpr const I& base() const& { return base_; } constexpr I base() && { return std::move(base_); } + template + requires sentinel_for_base + 
constexpr bool operator==(I2 const& other) const { + return base_ == other.base(); + } + private: I base_ = I(); }; diff --git a/libcxx/test/support/test_range.h b/libcxx/test/support/test_range.h index c99e3f72e8bf9..6b279e21ce38e 100644 --- a/libcxx/test/support/test_range.h +++ b/libcxx/test/support/test_range.h @@ -62,4 +62,10 @@ struct test_view : std::ranges::view_base { sentinel end() const; }; +template class I, class R> +constexpr auto make_archetype_range(R&& r) { + return std::ranges::subrange(I(std::ranges::begin(r)), sentinel_wrapper(std::ranges::end(r))); +} + + #endif // LIBCXX_TEST_SUPPORT_TEST_RANGE_H From 95795e9d6e4c142fbb77faacc403f6bcaa72372e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 13 Aug 2021 18:34:09 +0000 Subject: [PATCH 006/700] [gn build] Port 7b20e05c714e --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index f3bc8f98104c9..24002ac826008 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -285,6 +285,7 @@ if (current_toolchain == default_toolchain) { "__ranges/empty_view.h", "__ranges/enable_borrowed_range.h", "__ranges/enable_view.h", + "__ranges/iota_view.h", "__ranges/non_propagating_cache.h", "__ranges/ref_view.h", "__ranges/reverse_view.h", From a00eb480590a79ccaec20cd35cf11cf621bfdaf4 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 13 Aug 2021 18:34:09 +0000 Subject: [PATCH 007/700] [gn build] Port df324bba5c4c --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 24002ac826008..ccc7be9d192aa 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -286,6 +286,7 @@ if 
(current_toolchain == default_toolchain) { "__ranges/enable_borrowed_range.h", "__ranges/enable_view.h", "__ranges/iota_view.h", + "__ranges/join_view.h", "__ranges/non_propagating_cache.h", "__ranges/ref_view.h", "__ranges/reverse_view.h", From a9831cce1e7a74f507071073fcea29f81c413aac Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 11:37:26 -0700 Subject: [PATCH 008/700] [NFC] Remove public uses of AttributeList::getAttributes() Use methods that better convey the intent. --- llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp | 15 +++++++-------- llvm/unittests/IR/AttributesTest.cpp | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index dd96c763ce7ce..259b802bda63b 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -203,21 +203,20 @@ struct AssumeBuilderState { } void addCall(const CallBase *Call) { - auto addAttrList = [&](AttributeList AttrList) { - for (unsigned Idx = AttributeList::FirstArgIndex; - Idx < AttrList.getNumAttrSets(); Idx++) - for (Attribute Attr : AttrList.getAttributes(Idx)) { + auto addAttrList = [&](AttributeList AttrList, unsigned NumArgs) { + for (unsigned Idx = 0; Idx < NumArgs; Idx++) + for (Attribute Attr : AttrList.getParamAttrs(Idx)) { bool IsPoisonAttr = Attr.hasAttribute(Attribute::NonNull) || Attr.hasAttribute(Attribute::Alignment); - if (!IsPoisonAttr || Call->isPassingUndefUB(Idx - 1)) - addAttribute(Attr, Call->getArgOperand(Idx - 1)); + if (!IsPoisonAttr || Call->isPassingUndefUB(Idx)) + addAttribute(Attr, Call->getArgOperand(Idx)); } for (Attribute Attr : AttrList.getFnAttrs()) addAttribute(Attr, nullptr); }; - addAttrList(Call->getAttributes()); + addAttrList(Call->getAttributes(), Call->arg_size()); if (Function *Fn = Call->getCalledFunction()) - addAttrList(Fn->getAttributes()); + addAttrList(Fn->getAttributes(), 
Fn->arg_size()); } AssumeInst *build() { diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index 4ba790058f8ce..30664b1d6fedb 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -54,7 +54,7 @@ TEST(Attributes, Ordering) { AttributeList::get(C, 1, Attribute::SExt)}; AttributeList SetA = AttributeList::get(C, ASs); - AttributeList SetB = SetA.removeAttributes(C, 1, ASs[1].getAttributes(1)); + AttributeList SetB = SetA.removeAttributes(C, 1, ASs[1].getParamAttrs(0)); EXPECT_NE(SetA, SetB); } From d7593ebaeeec2072abed73db0299a0e71f5c7a2c Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 11:59:18 -0700 Subject: [PATCH 009/700] [NFC] Clean up users of AttributeList::hasAttribute() AttributeList::hasAttribute() is confusing, use clearer methods like hasParamAttr()/hasRetAttr(). Add hasRetAttr() since it was missing from AttributeList. --- llvm/include/llvm/IR/Attributes.h | 8 +++++++ llvm/include/llvm/IR/Function.h | 3 +-- llvm/include/llvm/IR/InstrTypes.h | 2 +- llvm/lib/AsmParser/LLParser.cpp | 2 +- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 4 ++-- llvm/lib/CodeGen/MachinePipeliner.cpp | 3 +-- .../SelectionDAG/SelectionDAGBuilder.cpp | 9 +++---- llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 ++++---- .../AArch64/GISel/AArch64CallLowering.cpp | 6 ++--- llvm/lib/Target/AVR/AVRISelLowering.cpp | 3 +-- .../Hexagon/HexagonOptimizeSZextends.cpp | 4 ++-- .../WebAssembly/WebAssemblyFastISel.cpp | 4 ++-- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 6 ++--- .../Scalar/RewriteStatepointsForGC.cpp | 2 +- llvm/unittests/IR/AttributesTest.cpp | 24 +++++++++---------- 16 files changed, 46 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 3b1cead212c85..dacab3938f557 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -652,6 +652,14 
@@ class AttributeList { return hasAttributes(ArgNo + FirstArgIndex); } + /// Return true if the attribute exists for the return value. + bool hasRetAttr(Attribute::AttrKind Kind) const { + return hasAttribute(ReturnIndex, Kind); + } + + /// Return true if attributes exists for the return value. + bool hasRetAttrs() const { return hasAttributes(ReturnIndex); } + /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but /// may be faster. bool hasFnAttr(Attribute::AttrKind Kind) const; diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index d1c8a231d45aa..ce0bbb54d846f 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -699,8 +699,7 @@ class Function : public GlobalObject, public ilist_node { /// Determine if the parameter or return value is marked with NoAlias /// attribute. bool returnDoesNotAlias() const { - return AttributeSets.hasAttribute(AttributeList::ReturnIndex, - Attribute::NoAlias); + return AttributeSets.hasRetAttr(Attribute::NoAlias); } void setReturnDoesNotAlias() { addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index aab51f113fe38..65cada3d735de 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1760,7 +1760,7 @@ class CallBase : public Instruction { /// Determine if the return value is marked with NoAlias attribute. 
bool returnDoesNotAlias() const { - return Attrs.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + return Attrs.hasRetAttr(Attribute::NoAlias); } /// If one of the arguments has the 'returned' attribute, returns its diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 156b46cc94534..9ae10d76692a1 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -5571,7 +5571,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { AttributeList::get(Context, AttributeSet::get(Context, FuncAttrs), AttributeSet::get(Context, RetAttrs), Attrs); - if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy()) + if (PAL.hasParamAttr(0, Attribute::StructRet) && !RetType->isVoidTy()) return error(RetTypeLoc, "functions with 'sret' argument must return void"); FunctionType *FT = FunctionType::get(RetType, ParamTypeList, IsVarArg); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index e12b3ed2b70ca..2ce32748453d2 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -497,8 +497,8 @@ static bool isLibCallInTailPosition(MachineInstr &MI, return false; // It's not safe to eliminate the sign / zero extension of the return value. 
- if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) || - CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) + if (CallerAttrs.hasRetAttr(Attribute::ZExt) || + CallerAttrs.hasRetAttr(Attribute::SExt)) return false; // Only tail call if the following instruction is a standard return or if we diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index caa3f8049aebd..f32c8a2978fe3 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -200,8 +200,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { if (!EnableSWP) return false; - if (mf.getFunction().getAttributes().hasAttribute( - AttributeList::FunctionIndex, Attribute::OptimizeForSize) && + if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) && !EnableSWPOptSize.getPosition()) return false; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 0786dc395b409..9e21a213a77aa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1953,16 +1953,13 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { /*IsVarArg*/ false, DL); ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::SExt)) + if (F->getAttributes().hasRetAttr(Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::ZExt)) + else if (F->getAttributes().hasRetAttr(Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; LLVMContext &Context = F->getContext(); - bool RetInReg = F->getAttributes().hasAttribute( - AttributeList::ReturnIndex, Attribute::InReg); + bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg); for (unsigned j = 0; j != NumValues; ++j) { EVT VT = ValueVTs[j]; diff --git 
a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index d11c6b05aad7c..0b817c99d20a5 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1657,9 +1657,9 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, EVT VT = ValueVTs[j]; ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) + if (attr.hasRetAttr(Attribute::SExt)) ExtendKind = ISD::SIGN_EXTEND; - else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) + else if (attr.hasRetAttr(Attribute::ZExt)) ExtendKind = ISD::ZERO_EXTEND; // FIXME: C calling convention requires the return type to be promoted to @@ -1679,13 +1679,13 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, // 'inreg' on function refers to return value ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg)) + if (attr.hasRetAttr(Attribute::InReg)) Flags.setInReg(); // Propagate extension type if any - if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)) + if (attr.hasRetAttr(Attribute::SExt)) Flags.setSExt(); - else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt)) + else if (attr.hasRetAttr(Attribute::ZExt)) Flags.setZExt(); for (unsigned i = 0; i < NumParts; ++i) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index df5dd59df190b..5ad2c6e6e7a08 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -387,11 +387,9 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, MVT NewVT = TLI.getRegisterTypeForCallingConv(Ctx, CC, SplitEVTs[i]); if (EVT(NewVT) != SplitEVTs[i]) { unsigned ExtendOp = TargetOpcode::G_ANYEXT; - if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::SExt)) + if 
(F.getAttributes().hasRetAttr(Attribute::SExt)) ExtendOp = TargetOpcode::G_SEXT; - else if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::ZExt)) + else if (F.getAttributes().hasRetAttr(Attribute::ZExt)) ExtendOp = TargetOpcode::G_ZEXT; LLT NewLLT(NewVT); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index 58a7aed91cdf3..ce50ed0bcfd74 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -1488,8 +1488,7 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Don't emit the ret/reti instruction when the naked attribute is present in // the function being compiled. - if (MF.getFunction().getAttributes().hasAttribute( - AttributeList::FunctionIndex, Attribute::Naked)) { + if (MF.getFunction().getAttributes().hasFnAttr(Attribute::Naked)) { return Chain; } diff --git a/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp index e026bb6d601d0..1941c0f35e5b4 100644 --- a/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp +++ b/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp @@ -67,12 +67,12 @@ bool HexagonOptimizeSZextends::runOnFunction(Function &F) { if (skipFunction(F)) return false; - unsigned Idx = 1; + unsigned Idx = 0; // Try to optimize sign extends in formal parameters. It's relying on // callee already sign extending the values. I'm not sure if our ABI // requires callee to sign extend though. 
for (auto &Arg : F.args()) { - if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) { + if (F.getAttributes().hasParamAttr(Idx, Attribute::SExt)) { if (!isa(Arg.getType())) { for (auto UI = Arg.use_begin(); UI != Arg.use_end();) { if (isa(*UI)) { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index c40e4f903df58..4495d624e09d1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -1370,9 +1370,9 @@ bool WebAssemblyFastISel::selectRet(const Instruction *I) { } unsigned Reg; - if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::SExt)) + if (FuncInfo.Fn->getAttributes().hasRetAttr(Attribute::SExt)) Reg = getRegForSignedValue(RV); - else if (FuncInfo.Fn->getAttributes().hasAttribute(0, Attribute::ZExt)) + else if (FuncInfo.Fn->getAttributes().hasRetAttr(Attribute::ZExt)) Reg = getRegForUnsignedValue(RV); else Reg = getRegForValue(RV); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 72e9a2b4b29a7..6614978b25d1a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27235,11 +27235,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; - unsigned Idx = 1; + unsigned Idx = 0; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + if (Attrs.hasParamAttr(Idx, Attribute::InReg)) { const DataLayout &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. 
InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index ca8660a98ded8..eae971dd32af7 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1055,8 +1055,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // pointers. for (Function *F : SCCNodes) { // Already nonnull. - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull)) + if (F->getAttributes().hasRetAttr(Attribute::NonNull)) continue; // We can infer and propagate function attributes only when we know that the @@ -1090,8 +1089,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { if (SCCReturnsNonNull) { for (Function *F : SCCNodes) { - if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull) || + if (F->getAttributes().hasRetAttr(Attribute::NonNull) || !F->getReturnType()->isPointerTy()) continue; diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 4be07558d42df..77701bf1d0c65 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1533,7 +1533,7 @@ static StringRef getDeoptLowering(CallBase *Call) { // FIXME: Calls have a *really* confusing interface around attributes // with values. 
const AttributeList &CSAS = Call->getAttributes(); - if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering)) + if (CSAS.hasFnAttr(DeoptLowering)) return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering) .getValueAsString(); Function *F = Call->getCalledFunction(); diff --git a/llvm/unittests/IR/AttributesTest.cpp b/llvm/unittests/IR/AttributesTest.cpp index 30664b1d6fedb..86e7389877266 100644 --- a/llvm/unittests/IR/AttributesTest.cpp +++ b/llvm/unittests/IR/AttributesTest.cpp @@ -68,7 +68,7 @@ TEST(Attributes, AddAttributes) { B.clear(); B.addAttribute(Attribute::SExt); AL = AL.addAttributes(C, AttributeList::ReturnIndex, B); - EXPECT_TRUE(AL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt)); + EXPECT_TRUE(AL.hasRetAttr(Attribute::SExt)); EXPECT_TRUE(AL.hasFnAttr(Attribute::NoReturn)); } @@ -102,9 +102,9 @@ TEST(Attributes, RemoveAlign) { AttributeList AL; AL = AL.addParamAttributes(C, 0, B_align_readonly); AL = AL.addAttributes(C, 0, B_stackalign_optnone); - EXPECT_TRUE(AL.hasAttributes(0)); - EXPECT_TRUE(AL.hasAttribute(0, Attribute::StackAlignment)); - EXPECT_TRUE(AL.hasAttribute(0, Attribute::OptimizeNone)); + EXPECT_TRUE(AL.hasRetAttrs()); + EXPECT_TRUE(AL.hasRetAttr(Attribute::StackAlignment)); + EXPECT_TRUE(AL.hasRetAttr(Attribute::OptimizeNone)); EXPECT_TRUE(AL.getStackAlignment(0) == 32); EXPECT_TRUE(AL.hasParamAttrs(0)); EXPECT_TRUE(AL.hasParamAttr(0, Attribute::Alignment)); @@ -114,15 +114,15 @@ TEST(Attributes, RemoveAlign) { AL = AL.removeParamAttribute(C, 0, Attribute::Alignment); EXPECT_FALSE(AL.hasParamAttr(0, Attribute::Alignment)); EXPECT_TRUE(AL.hasParamAttr(0, Attribute::ReadOnly)); - EXPECT_TRUE(AL.hasAttribute(0, Attribute::StackAlignment)); - EXPECT_TRUE(AL.hasAttribute(0, Attribute::OptimizeNone)); + EXPECT_TRUE(AL.hasRetAttr(Attribute::StackAlignment)); + EXPECT_TRUE(AL.hasRetAttr(Attribute::OptimizeNone)); EXPECT_TRUE(AL.getStackAlignment(0) == 32); AL = AL.removeAttribute(C, 0, 
Attribute::StackAlignment); EXPECT_FALSE(AL.hasParamAttr(0, Attribute::Alignment)); EXPECT_TRUE(AL.hasParamAttr(0, Attribute::ReadOnly)); - EXPECT_FALSE(AL.hasAttribute(0, Attribute::StackAlignment)); - EXPECT_TRUE(AL.hasAttribute(0, Attribute::OptimizeNone)); + EXPECT_FALSE(AL.hasRetAttr(Attribute::StackAlignment)); + EXPECT_TRUE(AL.hasRetAttr(Attribute::OptimizeNone)); AttributeList AL2; AL2 = AL2.addParamAttributes(C, 0, B_align_readonly); @@ -131,15 +131,15 @@ TEST(Attributes, RemoveAlign) { AL2 = AL2.removeParamAttributes(C, 0, B_align); EXPECT_FALSE(AL2.hasParamAttr(0, Attribute::Alignment)); EXPECT_TRUE(AL2.hasParamAttr(0, Attribute::ReadOnly)); - EXPECT_TRUE(AL2.hasAttribute(0, Attribute::StackAlignment)); - EXPECT_TRUE(AL2.hasAttribute(0, Attribute::OptimizeNone)); + EXPECT_TRUE(AL2.hasRetAttr(Attribute::StackAlignment)); + EXPECT_TRUE(AL2.hasRetAttr(Attribute::OptimizeNone)); EXPECT_TRUE(AL2.getStackAlignment(0) == 32); AL2 = AL2.removeAttributes(C, 0, B_stackalign); EXPECT_FALSE(AL2.hasParamAttr(0, Attribute::Alignment)); EXPECT_TRUE(AL2.hasParamAttr(0, Attribute::ReadOnly)); - EXPECT_FALSE(AL2.hasAttribute(0, Attribute::StackAlignment)); - EXPECT_TRUE(AL2.hasAttribute(0, Attribute::OptimizeNone)); + EXPECT_FALSE(AL2.hasRetAttr(Attribute::StackAlignment)); + EXPECT_TRUE(AL2.hasRetAttr(Attribute::OptimizeNone)); } TEST(Attributes, AddMatchingAlignAttr) { From 8e9ffa1dc6988367c8e3d688044859c5aa7cf485 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 12:07:05 -0700 Subject: [PATCH 010/700] [NFC] Cleanup callers of AttributeList::hasAttributes() AttributeList::hasAttributes() is confusing, use clearer methods like hasFnAttrs(). 
--- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- llvm/include/llvm/IR/Attributes.h | 11 ++++++----- llvm/lib/IR/AsmWriter.cpp | 18 +++++++++--------- llvm/lib/IR/Verifier.cpp | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index fa8312d30ad2d..ebd3cc7d7dcc2 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3613,7 +3613,7 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( assert(F->getName() == MangledName && "name was uniqued!"); if (D) SetFunctionAttributes(GD, F, IsIncompleteFunction, IsThunk); - if (ExtraAttrs.hasAttributes(llvm::AttributeList::FunctionIndex)) { + if (ExtraAttrs.hasFnAttrs()) { llvm::AttrBuilder B(ExtraAttrs, llvm::AttributeList::FunctionIndex); F->addAttributes(llvm::AttributeList::FunctionIndex, B); } diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index dacab3938f557..08220b1d6413b 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -657,17 +657,18 @@ class AttributeList { return hasAttribute(ReturnIndex, Kind); } - /// Return true if attributes exists for the return value. + /// Return true if attributes exist for the return value. bool hasRetAttrs() const { return hasAttributes(ReturnIndex); } - /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but - /// may be faster. + /// Return true if the attribute exists for the function. bool hasFnAttr(Attribute::AttrKind Kind) const; - /// Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but - /// may be faster. + /// Return true if the attribute exists for the function. bool hasFnAttr(StringRef Kind) const; + /// Return true the attributes exist for the function. + bool hasFnAttrs() const { return hasAttributes(FunctionIndex); } + /// Return true if the specified attribute is set for at least one /// parameter or for the return value. 
If Index is not nullptr, the index /// of a parameter with the specified attribute is provided. diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index b3c11133e8308..6ad28e9099ea0 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3682,7 +3682,7 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "; Materializable\n"; const AttributeList &Attrs = F->getAttributes(); - if (Attrs.hasAttributes(AttributeList::FunctionIndex)) { + if (Attrs.hasFnAttrs()) { AttributeSet AS = Attrs.getFnAttrs(); std::string AttrStr; @@ -3720,7 +3720,7 @@ void AssemblyWriter::printFunction(const Function *F) { } FunctionType *FT = F->getFunctionType(); - if (Attrs.hasAttributes(AttributeList::ReturnIndex)) + if (Attrs.hasRetAttrs()) Out << Attrs.getAsString(AttributeList::ReturnIndex) << ' '; TypePrinter.print(F->getReturnType(), Out); Out << ' '; @@ -3769,7 +3769,7 @@ void AssemblyWriter::printFunction(const Function *F) { if (F->getAddressSpace() != 0 || !Mod || Mod->getDataLayout().getProgramAddressSpace() != 0) Out << " addrspace(" << F->getAddressSpace() << ")"; - if (Attrs.hasAttributes(AttributeList::FunctionIndex)) + if (Attrs.hasFnAttrs()) Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttrs()); if (F->hasSection()) { Out << " section \""; @@ -4126,7 +4126,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Type *RetTy = FTy->getReturnType(); const AttributeList &PAL = CI->getAttributes(); - if (PAL.hasAttributes(AttributeList::ReturnIndex)) + if (PAL.hasRetAttrs()) Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex); // Only print addrspace(N) if necessary: @@ -4155,7 +4155,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ", ..."; Out << ')'; - if (PAL.hasAttributes(AttributeList::FunctionIndex)) + if (PAL.hasFnAttrs()) Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(CI); @@ -4171,7 +4171,7 @@ void 
AssemblyWriter::printInstruction(const Instruction &I) { PrintCallingConv(II->getCallingConv(), Out); } - if (PAL.hasAttributes(AttributeList::ReturnIndex)) + if (PAL.hasRetAttrs()) Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex); // Only print addrspace(N) if necessary: @@ -4193,7 +4193,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } Out << ')'; - if (PAL.hasAttributes(AttributeList::FunctionIndex)) + if (PAL.hasFnAttrs()) Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(II); @@ -4214,7 +4214,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { PrintCallingConv(CBI->getCallingConv(), Out); } - if (PAL.hasAttributes(AttributeList::ReturnIndex)) + if (PAL.hasRetAttrs()) Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex); // If possible, print out the short form of the callbr instruction. We can @@ -4233,7 +4233,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } Out << ')'; - if (PAL.hasAttributes(AttributeList::FunctionIndex)) + if (PAL.hasFnAttrs()) Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(CBI); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 5fb1b99792791..e7857b93333f7 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1928,7 +1928,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, } } - if (!Attrs.hasAttributes(AttributeList::FunctionIndex)) + if (!Attrs.hasFnAttrs()) return; verifyAttributeTypes(Attrs.getFnAttrs(), V); From e33f301ec220bf5349692126b4cf5597e08185dd Mon Sep 17 00:00:00 2001 From: harsh-nod Date: Fri, 13 Aug 2021 12:54:30 -0700 Subject: [PATCH 011/700] [mlir] Add support for moving reductions to outer most dimensions in vector.multi_reduction The approach for handling reductions in the outer most dimension follows that for inner most dimensions, outlined below First, transpose to move reduction dims, if needed Convert reduction 
from n-d to 2-d canonical form Then, for outer reductions, we emit the appropriate op (add/mul/min/max/or/and/xor) and combine the results. Differential Revision: https://reviews.llvm.org/D107675 --- mlir/include/mlir/Dialect/Vector/VectorOps.h | 3 +- mlir/lib/Dialect/Vector/VectorTransforms.cpp | 217 ++++++++++++++---- ...vector-multi-reduction-outer-lowering.mlir | 161 +++++++++++++ .../Dialect/Vector/TestVectorTransforms.cpp | 9 +- 4 files changed, 340 insertions(+), 50 deletions(-) create mode 100644 mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h index cf53e8fcff97c..9bc2cd4e35acf 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h @@ -81,7 +81,8 @@ void populateVectorMaskMaterializationPatterns(RewritePatternSet &patterns, // Collect a set of patterns to convert vector.multi_reduction op into // a sequence of vector.reduction ops. -void populateVectorMultiReductionLoweringPatterns(RewritePatternSet &patterns); +void populateVectorMultiReductionLoweringPatterns( + RewritePatternSet &patterns, bool useInnerDimsForReduction = false); /// Collect a set of patterns to propagate insert_map/extract_map in the ssa /// chain. 
diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp index 278cd6d639cfa..f3ad31c042f6f 100644 --- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp @@ -3490,12 +3490,18 @@ class VectorCreateMaskOpConversion const bool enableIndexOptimizations; }; -// Converts vector.multi_reduction into inner-most reduction form by inserting -// vector.transpose -struct InnerDimReductionConversion +// Converts vector.multi_reduction into inner-most/outer-most reduction form +// by using vector.tranpose +class InnerOuterDimReductionConversion : public OpRewritePattern { +public: using OpRewritePattern::OpRewritePattern; + explicit InnerOuterDimReductionConversion(MLIRContext *context, + bool useInnerDimsForReduction) + : mlir::OpRewritePattern(context), + useInnerDimsForReduction(useInnerDimsForReduction) {} + LogicalResult matchAndRewrite(vector::MultiDimReductionOp multiReductionOp, PatternRewriter &rewriter) const override { auto src = multiReductionOp.source(); @@ -3516,87 +3522,116 @@ struct InnerDimReductionConversion parallelDims.push_back(i); } - // Add transpose only if inner-most dimensions are not reductions - if (parallelDims == - llvm::to_vector<4>(llvm::seq(0, parallelDims.size()))) + // Add transpose only if inner-most/outer-most dimensions are not parallel + if (useInnerDimsForReduction && + (parallelDims == + llvm::to_vector<4>(llvm::seq(0, parallelDims.size())))) + return failure(); + + if (!useInnerDimsForReduction && + (parallelDims != + llvm::to_vector<4>(llvm::seq(0, parallelDims.size())))) return failure(); SmallVector indices; - indices.append(parallelDims.begin(), parallelDims.end()); - indices.append(reductionDims.begin(), reductionDims.end()); + if (useInnerDimsForReduction) { + indices.append(parallelDims.begin(), parallelDims.end()); + indices.append(reductionDims.begin(), reductionDims.end()); + } else { + indices.append(reductionDims.begin(), 
reductionDims.end()); + indices.append(parallelDims.begin(), parallelDims.end()); + } auto transposeOp = rewriter.create(loc, src, indices); SmallVector reductionMask(srcRank, false); for (int i = 0; i < reductionSize; ++i) { - reductionMask[srcRank - i - 1] = true; + if (useInnerDimsForReduction) + reductionMask[srcRank - i - 1] = true; + else + reductionMask[i] = true; } rewriter.replaceOpWithNewOp( multiReductionOp, transposeOp.result(), reductionMask, multiReductionOp.kind()); return success(); } + +private: + const bool useInnerDimsForReduction; }; // Reduces the rank of vector.mult_reduction nd -> 2d given all reduction -// dimensions are inner most. -struct ReduceMultiDimReductionRank +// dimensions are either inner most or outer most. +class ReduceMultiDimReductionRank : public OpRewritePattern { +public: using OpRewritePattern::OpRewritePattern; + explicit ReduceMultiDimReductionRank(MLIRContext *context, + bool useInnerDimsForReduction) + : mlir::OpRewritePattern(context), + useInnerDimsForReduction(useInnerDimsForReduction) {} + LogicalResult matchAndRewrite(vector::MultiDimReductionOp multiReductionOp, PatternRewriter &rewriter) const override { auto srcRank = multiReductionOp.getSourceVectorType().getRank(); auto srcShape = multiReductionOp.getSourceVectorType().getShape(); + auto loc = multiReductionOp.getLoc(); if (srcRank == 2) return failure(); - auto loc = multiReductionOp.getLoc(); - auto reductionDims = llvm::to_vector<4>( - llvm::map_range(multiReductionOp.reduction_dims().cast(), - [](Attribute attr) -> int64_t { - return attr.cast().getInt(); - })); - llvm::sort(reductionDims); - - // Fails if not inner most reduction. 
- int64_t reductionSize = reductionDims.size(); - bool innerMostReduction = true; - for (int i = 0; i < reductionSize; ++i) { - if (reductionDims[reductionSize - i - 1] != srcRank - i - 1) { - innerMostReduction = false; + // Separate reduction and parallel dims + auto reductionDimsRange = + multiReductionOp.reduction_dims().getAsValueRange(); + auto reductionDims = llvm::to_vector<4>(llvm::map_range( + reductionDimsRange, [](APInt a) { return a.getZExtValue(); })); + llvm::SmallDenseSet reductionDimsSet(reductionDims.begin(), + reductionDims.end()); + SmallVector parallelDims, parallelShapes; + int canonicalReductionDim = 1; + int canonicalParallelDim = 1; + for (int64_t i = 0; i < srcRank; i++) { + if (!reductionDimsSet.contains(i)) { + parallelDims.push_back(i); + parallelShapes.push_back(srcShape[i]); + canonicalParallelDim *= srcShape[i]; + } else { + canonicalReductionDim *= srcShape[i]; } } - if (!innerMostReduction) + + // Fail if reduction dims are not either inner-most or outer-most + if (useInnerDimsForReduction && + (parallelDims != + llvm::to_vector<4>(llvm::seq(0, parallelDims.size())))) return failure(); - // Extracts 2d rank reduction shape. - int innerDims = 1; - int outterDims = 1; - SmallVector innerDimsShape; - for (int i = 0; i < srcRank; ++i) { - if (i < (srcRank - reductionSize)) { - innerDims *= srcShape[i]; - innerDimsShape.push_back(srcShape[i]); - } else { - outterDims *= srcShape[i]; - } - } + if (!useInnerDimsForReduction && + (parallelDims == + llvm::to_vector<4>(llvm::seq(0, parallelDims.size())))) + return failure(); // Creates shape cast for the inputs n_d -> 2d + int64_t outerDim = + useInnerDimsForReduction ? canonicalParallelDim : canonicalReductionDim; + int64_t innerDim = + useInnerDimsForReduction ? 
canonicalReductionDim : canonicalParallelDim; + auto castedType = VectorType::get( - {innerDims, outterDims}, + ArrayRef{outerDim, innerDim}, multiReductionOp.getSourceVectorType().getElementType()); auto castedOp = rewriter.create( loc, castedType, multiReductionOp.source()); - // Creates the canonical form of 2d vector.multi_reduction with inner most - // dim as reduction. + // Creates the canonical form of 2d vector.multi_reduction with inner/outer + // most dim as reduction. + SmallVector mask{!useInnerDimsForReduction, + useInnerDimsForReduction}; auto newOp = rewriter.create( - loc, castedOp.result(), ArrayRef{false, true}, - multiReductionOp.kind()); + loc, castedOp.result(), mask, multiReductionOp.kind()); // Creates shape cast for the output 2d -> nd - auto outputCastedType = VectorType::get( - innerDimsShape, + VectorType outputCastedType = VectorType::get( + parallelShapes, multiReductionOp.getSourceVectorType().getElementType()); Value castedOutputOp = rewriter.create( loc, outputCastedType, newOp.dest()); @@ -3604,6 +3639,88 @@ struct ReduceMultiDimReductionRank rewriter.replaceOp(multiReductionOp, castedOutputOp); return success(); } + +private: + const bool useInnerDimsForReduction; +}; + +// Unrolls vector.multi_reduction with outermost reductions +// and combines results +struct UnrollOuterMultiReduction + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::MultiDimReductionOp multiReductionOp, + PatternRewriter &rewriter) const override { + auto srcRank = multiReductionOp.getSourceVectorType().getRank(); + if (srcRank != 2) + return failure(); + + if (multiReductionOp.getReductionMask()[1] || + !multiReductionOp.getReductionMask()[0]) + return failure(); + + auto loc = multiReductionOp.getLoc(); + ArrayRef srcShape = + multiReductionOp.getSourceVectorType().getShape(); + + Type elementType = multiReductionOp.getDestVectorType().getElementType(); + if (!elementType.isIntOrIndexOrFloat()) 
+ return failure(); + + Value condition; + Value result = + rewriter.create(loc, multiReductionOp.source(), 0) + .getResult(); + for (int64_t i = 1; i < srcShape[0]; i++) { + auto operand = + rewriter.create(loc, multiReductionOp.source(), i); + switch (multiReductionOp.kind()) { + case vector::CombiningKind::ADD: + if (elementType.isIntOrIndex()) + result = rewriter.create(loc, operand, result); + else + result = rewriter.create(loc, operand, result); + break; + case vector::CombiningKind::MUL: + if (elementType.isIntOrIndex()) + result = rewriter.create(loc, operand, result); + else + result = rewriter.create(loc, operand, result); + break; + case vector::CombiningKind::MIN: + if (elementType.isIntOrIndex()) + condition = + rewriter.create(loc, CmpIPredicate::slt, operand, result); + else + condition = + rewriter.create(loc, CmpFPredicate::OLT, operand, result); + result = rewriter.create(loc, condition, operand, result); + break; + case vector::CombiningKind::MAX: + if (elementType.isIntOrIndex()) + condition = + rewriter.create(loc, CmpIPredicate::sge, operand, result); + else + condition = + rewriter.create(loc, CmpFPredicate::OGE, operand, result); + result = rewriter.create(loc, condition, operand, result); + break; + case vector::CombiningKind::AND: + result = rewriter.create(loc, operand, result); + break; + case vector::CombiningKind::OR: + result = rewriter.create(loc, operand, result); + break; + case vector::CombiningKind::XOR: + result = rewriter.create(loc, operand, result); + break; + } + } + + rewriter.replaceOp(multiReductionOp, result); + return success(); + } }; // Converts 2d vector.multi_reduction with inner most reduction dimension into a @@ -3747,9 +3864,13 @@ void mlir::vector::populateVectorTransferLoweringPatterns( } void mlir::vector::populateVectorMultiReductionLoweringPatterns( - RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + RewritePatternSet &patterns, bool useInnerDimsForReduction) { + patterns.add( + 
patterns.getContext(), useInnerDimsForReduction); + if (useInnerDimsForReduction) + patterns.add(patterns.getContext()); + else + patterns.add(patterns.getContext()); } void mlir::vector::populateVectorUnrollPatterns( diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir new file mode 100644 index 0000000000000..91dcc2e0172f7 --- /dev/null +++ b/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir @@ -0,0 +1,161 @@ +// RUN: mlir-opt %s -test-vector-multi-reduction-lowering-patterns="use-outer-reductions" | FileCheck %s + +func @vector_multi_reduction(%arg0: vector<2x4xf32>) -> vector<2xf32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [1] : vector<2x4xf32> to vector<2xf32> + return %0 : vector<2xf32> +} + +// CHECK-LABEL: func @vector_multi_reduction +// CHECK-SAME: %[[INPUT:.+]]: vector<2x4xf32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [1, 0] : vector<2x4xf32> to vector<4x2xf32> +// CHECK: %[[V0:.+]] = vector.extract %[[TRANSPOSED]][0] : vector<4x2xf32> +// CHECK: %[[V1:.+]] = vector.extract %[[TRANSPOSED]][1] : vector<4x2xf32> +// CHECK: %[[RV01:.+]] = mulf %[[V1]], %[[V0]] : vector<2xf32> +// CHECK: %[[V2:.+]] = vector.extract %[[TRANSPOSED]][2] : vector<4x2xf32> +// CHECK: %[[RV012:.+]] = mulf %[[V2]], %[[RV01]] : vector<2xf32> +// CHECK: %[[V3:.+]] = vector.extract %[[TRANSPOSED]][3] : vector<4x2xf32> +// CHECK: %[[RESULT_VEC:.+]] = mulf %[[V3]], %[[RV012]] : vector<2xf32> +// CHECK: return %[[RESULT_VEC]] : vector<2xf32> + +func @vector_multi_reduction_min(%arg0: vector<2x4xf32>) -> vector<2xf32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [1] : vector<2x4xf32> to vector<2xf32> + return %0 : vector<2xf32> +} + +// CHECK-LABEL: func @vector_multi_reduction_min +// CHECK-SAME: %[[INPUT:.+]]: vector<2x4xf32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [1, 0] : vector<2x4xf32> to vector<4x2xf32> +// 
CHECK: %[[V0:.+]] = vector.extract %[[TRANSPOSED]][0] : vector<4x2xf32> +// CHECK: %[[V1:.+]] = vector.extract %[[TRANSPOSED]][1] : vector<4x2xf32> +// CHECK: %[[C0:.+]] = cmpf olt, %[[V1]], %[[V0]] : vector<2xf32> +// CHECK: %[[RV01:.+]] = select %[[C0]], %[[V1]], %[[V0]] : vector<2xi1>, vector<2xf32> +// CHECK: %[[V2:.+]] = vector.extract %[[TRANSPOSED]][2] : vector<4x2xf32> +// CHECK: %[[C1:.+]] = cmpf olt, %[[V2]], %[[RV01]] : vector<2xf32> +// CHECK: %[[RV012:.+]] = select %[[C1]], %[[V2]], %[[RV01]] : vector<2xi1>, vector<2xf32> +// CHECK: %[[V3:.+]] = vector.extract %[[TRANSPOSED]][3] : vector<4x2xf32> +// CHECK: %[[C2:.+]] = cmpf olt, %[[V3]], %[[RV012]] : vector<2xf32> +// CHECK: %[[RESULT_VEC:.+]] = select %[[C2]], %[[V3]], %[[RV012]] : vector<2xi1>, vector<2xf32> +// CHECK: return %[[RESULT_VEC]] : vector<2xf32> + +func @vector_multi_reduction_max(%arg0: vector<2x4xf32>) -> vector<2xf32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [1] : vector<2x4xf32> to vector<2xf32> + return %0 : vector<2xf32> +} + +// CHECK-LABEL: func @vector_multi_reduction_max +// CHECK-SAME: %[[INPUT:.+]]: vector<2x4xf32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [1, 0] : vector<2x4xf32> to vector<4x2xf32> +// CHECK: %[[V0:.+]] = vector.extract %[[TRANSPOSED]][0] : vector<4x2xf32> +// CHECK: %[[V1:.+]] = vector.extract %[[TRANSPOSED]][1] : vector<4x2xf32> +// CHECK: %[[C0:.+]] = cmpf oge, %[[V1]], %[[V0]] : vector<2xf32> +// CHECK: %[[RV01:.+]] = select %[[C0]], %[[V1]], %[[V0]] : vector<2xi1>, vector<2xf32> +// CHECK: %[[V2:.+]] = vector.extract %[[TRANSPOSED]][2] : vector<4x2xf32> +// CHECK: %[[C1:.+]] = cmpf oge, %[[V2]], %[[RV01]] : vector<2xf32> +// CHECK: %[[RV012:.+]] = select %[[C1]], %[[V2]], %[[RV01]] : vector<2xi1>, vector<2xf32> +// CHECK: %[[V3:.+]] = vector.extract %[[TRANSPOSED]][3] : vector<4x2xf32> +// CHECK: %[[C2:.+]] = cmpf oge, %[[V3]], %[[RV012]] : vector<2xf32> +// CHECK: %[[RESULT_VEC:.+]] = select %[[C2]], %[[V3]], %[[RV012]] : 
vector<2xi1>, vector<2xf32> +// CHECK: return %[[RESULT_VEC]] : vector<2xf32> + +func @vector_multi_reduction_and(%arg0: vector<2x4xi32>) -> vector<2xi32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [1] : vector<2x4xi32> to vector<2xi32> + return %0 : vector<2xi32> +} + +// CHECK-LABEL: func @vector_multi_reduction_and +// CHECK-SAME: %[[INPUT:.+]]: vector<2x4xi32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [1, 0] : vector<2x4xi32> to vector<4x2xi32> +// CHECK: %[[V0:.+]] = vector.extract %[[TRANSPOSED]][0] : vector<4x2xi32> +// CHECK: %[[V1:.+]] = vector.extract %[[TRANSPOSED]][1] : vector<4x2xi32> +// CHECK: %[[RV01:.+]] = and %[[V1]], %[[V0]] : vector<2xi32> +// CHECK: %[[V2:.+]] = vector.extract %[[TRANSPOSED]][2] : vector<4x2xi32> +// CHECK: %[[RV012:.+]] = and %[[V2]], %[[RV01]] : vector<2xi32> +// CHECK: %[[V3:.+]] = vector.extract %[[TRANSPOSED]][3] : vector<4x2xi32> +// CHECK: %[[RESULT_VEC:.+]] = and %[[V3]], %[[RV012]] : vector<2xi32> +// CHECK: return %[[RESULT_VEC]] : vector<2xi32> + +func @vector_multi_reduction_or(%arg0: vector<2x4xi32>) -> vector<2xi32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [1] : vector<2x4xi32> to vector<2xi32> + return %0 : vector<2xi32> +} + +// CHECK-LABEL: func @vector_multi_reduction_or +// CHECK-SAME: %[[INPUT:.+]]: vector<2x4xi32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [1, 0] : vector<2x4xi32> to vector<4x2xi32> +// CHECK: %[[V0:.+]] = vector.extract %[[TRANSPOSED]][0] : vector<4x2xi32> +// CHECK: %[[V1:.+]] = vector.extract %[[TRANSPOSED]][1] : vector<4x2xi32> +// CHECK: %[[RV01:.+]] = or %[[V1]], %[[V0]] : vector<2xi32> +// CHECK: %[[V2:.+]] = vector.extract %[[TRANSPOSED]][2] : vector<4x2xi32> +// CHECK: %[[RV012:.+]] = or %[[V2]], %[[RV01]] : vector<2xi32> +// CHECK: %[[V3:.+]] = vector.extract %[[TRANSPOSED]][3] : vector<4x2xi32> +// CHECK: %[[RESULT_VEC:.+]] = or %[[V3]], %[[RV012]] : vector<2xi32> +// CHECK: return %[[RESULT_VEC]] : vector<2xi32> + +func 
@vector_multi_reduction_xor(%arg0: vector<2x4xi32>) -> vector<2xi32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [1] : vector<2x4xi32> to vector<2xi32> + return %0 : vector<2xi32> +} + +// CHECK-LABEL: func @vector_multi_reduction_xor +// CHECK-SAME: %[[INPUT:.+]]: vector<2x4xi32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [1, 0] : vector<2x4xi32> to vector<4x2xi32> +// CHECK: %[[V0:.+]] = vector.extract %[[TRANSPOSED]][0] : vector<4x2xi32> +// CHECK: %[[V1:.+]] = vector.extract %[[TRANSPOSED]][1] : vector<4x2xi32> +// CHECK: %[[RV01:.+]] = xor %[[V1]], %[[V0]] : vector<2xi32> +// CHECK: %[[V2:.+]] = vector.extract %[[TRANSPOSED]][2] : vector<4x2xi32> +// CHECK: %[[RV012:.+]] = xor %[[V2]], %[[RV01]] : vector<2xi32> +// CHECK: %[[V3:.+]] = vector.extract %[[TRANSPOSED]][3] : vector<4x2xi32> +// CHECK: %[[RESULT_VEC:.+]] = xor %[[V3]], %[[RV012]] : vector<2xi32> +// CHECK: return %[[RESULT_VEC]] : vector<2xi32> + + +func @vector_reduction_outer(%arg0: vector<2x3x4x5xi32>) -> vector<2x3xi32> { + %0 = vector.multi_reduction #vector.kind, %arg0 [2, 3] : vector<2x3x4x5xi32> to vector<2x3xi32> + return %0 : vector<2x3xi32> +} + +// CHECK-LABEL: func @vector_reduction_outer +// CHECK-SAME: %[[INPUT:.+]]: vector<2x3x4x5xi32> +// CHECK: %[[TRANSPOSED:.+]] = vector.transpose %[[INPUT]], [2, 3, 0, 1] : vector<2x3x4x5xi32> to vector<4x5x2x3xi32> +// CHECK: %[[RESHAPED:.+]] = vector.shape_cast %[[TRANSPOSED]] : vector<4x5x2x3xi32> to vector<20x6xi32> +// CHECK: %[[V0:.+]] = vector.extract %[[RESHAPED]][0] : vector<20x6xi32> +// CHECK: %[[V1:.+]] = vector.extract %[[RESHAPED]][1] : vector<20x6xi32> +// CHECK: %[[R0:.+]] = addi %[[V1]], %[[V0]] : vector<6xi32> +// CHECK: %[[V2:.+]] = vector.extract %[[RESHAPED]][2] : vector<20x6xi32> +// CHECK: %[[R1:.+]] = addi %[[V2]], %[[R0]] : vector<6xi32> +// CHECK: %[[V3:.+]] = vector.extract %[[RESHAPED]][3] : vector<20x6xi32> +// CHECK: %[[R2:.+]] = addi %[[V3]], %[[R1]] : vector<6xi32> +// CHECK: %[[V4:.+]] = 
vector.extract %[[RESHAPED]][4] : vector<20x6xi32> +// CHECK: %[[R3:.+]] = addi %[[V4]], %[[R2]] : vector<6xi32> +// CHECK: %[[V5:.+]] = vector.extract %[[RESHAPED]][5] : vector<20x6xi32> +// CHECK: %[[R4:.+]] = addi %[[V5]], %[[R3]] : vector<6xi32> +// CHECK: %[[V6:.+]] = vector.extract %[[RESHAPED]][6] : vector<20x6xi32> +// CHECK: %[[R5:.+]] = addi %[[V6]], %[[R4]] : vector<6xi32> +// CHECK: %[[V7:.+]] = vector.extract %[[RESHAPED]][7] : vector<20x6xi32> +// CHECK: %[[R6:.+]] = addi %[[V7]], %[[R5]] : vector<6xi32> +// CHECK: %[[V8:.+]] = vector.extract %[[RESHAPED]][8] : vector<20x6xi32> +// CHECK: %[[R7:.+]] = addi %[[V8]], %[[R6]] : vector<6xi32> +// CHECK: %[[V9:.+]] = vector.extract %[[RESHAPED]][9] : vector<20x6xi32> +// CHECK: %[[R8:.+]] = addi %[[V9]], %[[R7]] : vector<6xi32> +// CHECK: %[[V10:.+]] = vector.extract %[[RESHAPED]][10] : vector<20x6xi32> +// CHECK: %[[R9:.+]] = addi %[[V10]], %[[R8]] : vector<6xi32> +// CHECK: %[[V11:.+]] = vector.extract %[[RESHAPED]][11] : vector<20x6xi32> +// CHECK: %[[R10:.+]] = addi %[[V11]], %[[R9]] : vector<6xi32> +// CHECK: %[[V12:.+]] = vector.extract %[[RESHAPED]][12] : vector<20x6xi32> +// CHECK: %[[R11:.+]] = addi %[[V12]], %[[R10]] : vector<6xi32> +// CHECK: %[[V13:.+]] = vector.extract %[[RESHAPED]][13] : vector<20x6xi32> +// CHECK: %[[R12:.+]] = addi %[[V13]], %[[R11]] : vector<6xi32> +// CHECK: %[[V14:.+]] = vector.extract %[[RESHAPED]][14] : vector<20x6xi32> +// CHECK: %[[R13:.+]] = addi %[[V14]], %[[R12]] : vector<6xi32> +// CHECK: %[[V15:.+]] = vector.extract %[[RESHAPED]][15] : vector<20x6xi32> +// CHECK: %[[R14:.+]] = addi %[[V15]], %[[R13]] : vector<6xi32> +// CHECK: %[[V16:.+]] = vector.extract %[[RESHAPED]][16] : vector<20x6xi32> +// CHECK: %[[R15:.+]] = addi %[[V16]], %[[R14]] : vector<6xi32> +// CHECK: %[[V17:.+]] = vector.extract %[[RESHAPED]][17] : vector<20x6xi32> +// CHECK: %[[R16:.+]] = addi %[[V17]], %[[R15]] : vector<6xi32> +// CHECK: %[[V18:.+]] = vector.extract %[[RESHAPED]][18] : 
vector<20x6xi32> +// CHECK: %[[R17:.+]] = addi %[[V18]], %[[R16]] : vector<6xi32> +// CHECK: %[[V19:.+]] = vector.extract %[[RESHAPED]][19] : vector<20x6xi32> +// CHECK: %[[R18:.+]] = addi %[[V19]], %[[R17]] : vector<6xi32> +// CHECK: %[[RESULT_VEC:.+]] = vector.shape_cast %[[R18]] : vector<6xi32> to vector<2x3xi32> +// CHECK: return %[[RESULT_VEC]] : vector<2x3xi32> diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index 11b56a583cc83..907f9aedfdb17 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -444,6 +444,9 @@ struct TestVectorTransferLoweringPatterns struct TestVectorMultiReductionLoweringPatterns : public PassWrapper { + TestVectorMultiReductionLoweringPatterns() = default; + TestVectorMultiReductionLoweringPatterns( + const TestVectorMultiReductionLoweringPatterns &pass) {} void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); } @@ -454,9 +457,13 @@ struct TestVectorMultiReductionLoweringPatterns return "Test conversion patterns to lower vector.multi_reduction to other " "vector ops"; } + Option useOuterReductions{ + *this, "use-outer-reductions", + llvm::cl::desc("Move reductions to outer most dimensions"), + llvm::cl::init(false)}; void runOnFunction() override { RewritePatternSet patterns(&getContext()); - populateVectorMultiReductionLoweringPatterns(patterns); + populateVectorMultiReductionLoweringPatterns(patterns, !useOuterReductions); (void)applyPatternsAndFoldGreedily(getFunction(), std::move(patterns)); } }; From f7e534c174a446f9ecc7b369dc75f1ad6da5e0ef Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 13 Aug 2021 17:03:22 -0400 Subject: [PATCH 012/700] [x86] add tests for fcmps with logic ops; NFC --- llvm/test/CodeGen/X86/fcmp-logic.ll | 151 ++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 llvm/test/CodeGen/X86/fcmp-logic.ll 
diff --git a/llvm/test/CodeGen/X86/fcmp-logic.ll b/llvm/test/CodeGen/X86/fcmp-logic.ll new file mode 100644 index 0000000000000..54f7183ef3337 --- /dev/null +++ b/llvm/test/CodeGen/X86/fcmp-logic.ll @@ -0,0 +1,151 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s + +define i1 @olt_ole_and_f32(float %w, float %x, float %y, float %z) { +; CHECK-LABEL: olt_ole_and_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomiss %xmm0, %xmm1 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: ucomiss %xmm2, %xmm3 +; CHECK-NEXT: setae %al +; CHECK-NEXT: andb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp olt float %w, %x + %f2 = fcmp ole float %y, %z + %r = and i1 %f1, %f2 + ret i1 %r +} + +define i1 @oge_oeq_or_f32(float %w, float %x, float %y, float %z) { +; CHECK-LABEL: oge_oeq_or_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: setae %cl +; CHECK-NEXT: ucomiss %xmm3, %xmm2 +; CHECK-NEXT: setnp %dl +; CHECK-NEXT: sete %al +; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: orb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp oge float %w, %x + %f2 = fcmp oeq float %y, %z + %r = or i1 %f1, %f2 + ret i1 %r +} + +define i1 @ord_one_xor_f32(float %w, float %x, float %y, float %z) { +; CHECK-LABEL: ord_one_xor_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: setnp %cl +; CHECK-NEXT: ucomiss %xmm3, %xmm2 +; CHECK-NEXT: setne %al +; CHECK-NEXT: xorb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp ord float %w, %x + %f2 = fcmp one float %y, %z + %r = xor i1 %f1, %f2 + ret i1 %r +} + +define i1 @une_ugt_and_f64(double %w, double %x, double %y, double %z) { +; CHECK-LABEL: une_ugt_and_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setp %al +; CHECK-NEXT: setne %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: ucomisd %xmm2, %xmm3 +; CHECK-NEXT: setb %al +; CHECK-NEXT: andb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp une double %w, %x + %f2 = fcmp ugt double %y, %z + %r = and i1 
%f1, %f2 + ret i1 %r +} + +define i1 @ult_uge_or_f64(double %w, double %x, double %y, double %z) { +; CHECK-LABEL: ult_uge_or_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: ucomisd %xmm2, %xmm3 +; CHECK-NEXT: setbe %al +; CHECK-NEXT: orb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp ult double %w, %x + %f2 = fcmp uge double %y, %z + %r = or i1 %f1, %f2 + ret i1 %r +} + +define i1 @une_uno_xor_f64(double %w, double %x, double %y, double %z) { +; CHECK-LABEL: une_uno_xor_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setp %al +; CHECK-NEXT: setne %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: ucomisd %xmm3, %xmm2 +; CHECK-NEXT: setp %al +; CHECK-NEXT: xorb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp une double %w, %x + %f2 = fcmp uno double %y, %z + %r = xor i1 %f1, %f2 + ret i1 %r +} + +define i1 @olt_olt_and_f32_f64(float %w, float %x, double %y, double %z) { +; CHECK-LABEL: olt_olt_and_f32_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomiss %xmm0, %xmm1 +; CHECK-NEXT: seta %cl +; CHECK-NEXT: ucomisd %xmm2, %xmm3 +; CHECK-NEXT: seta %al +; CHECK-NEXT: andb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp olt float %w, %x + %f2 = fcmp olt double %y, %z + %r = and i1 %f1, %f2 + ret i1 %r +} + +define i1 @une_uno_xor_f64_use1(double %w, double %x, double %y, double %z, i1* %p) { +; CHECK-LABEL: une_uno_xor_f64_use1: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setp %al +; CHECK-NEXT: setne %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: movb %cl, (%rdi) +; CHECK-NEXT: ucomisd %xmm3, %xmm2 +; CHECK-NEXT: setp %al +; CHECK-NEXT: xorb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp une double %w, %x + store i1 %f1, i1* %p + %f2 = fcmp uno double %y, %z + %r = xor i1 %f1, %f2 + ret i1 %r +} + +define i1 @une_uno_xor_f64_use2(double %w, double %x, double %y, double %z, i1* %p) { +; CHECK-LABEL: une_uno_xor_f64_use2: +; CHECK: # %bb.0: +; CHECK-NEXT: ucomisd %xmm1, %xmm0 +; CHECK-NEXT: setp %al 
+; CHECK-NEXT: setne %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: ucomisd %xmm3, %xmm2 +; CHECK-NEXT: setp %al +; CHECK-NEXT: setp (%rdi) +; CHECK-NEXT: xorb %cl, %al +; CHECK-NEXT: retq + %f1 = fcmp une double %w, %x + %f2 = fcmp uno double %y, %z + store i1 %f2, i1* %p + %r = xor i1 %f1, %f2 + ret i1 %r +} From 1f7b25ea76a925aca690da28de9d78db7ca99d0c Mon Sep 17 00:00:00 2001 From: Manoj Gupta Date: Fri, 13 Aug 2021 13:25:14 -0700 Subject: [PATCH 013/700] [lldb] skip host build for lldb_tblgen with LLDB_TABLEGEN_EXE set When cross compiling lldb-server, do not create a host build for building lldb-tblgeb when LLDB_TABLEGEN_EXE is already provided. This avoids an expensive and time-consuming build step if lldb-tblgen was already built previously for host. Reviewed By: JDevlieghere Differential Revision: https://reviews.llvm.org/D108053 --- lldb/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 2bb05c1e220b3..594c769141b43 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -64,7 +64,7 @@ if(LLVM_ENABLE_MODULES) list(APPEND LLVM_COMMON_DEPENDS intrinsics_gen) endif() -if(CMAKE_CROSSCOMPILING AND LLDB_BUILT_STANDALONE) +if(CMAKE_CROSSCOMPILING AND LLDB_BUILT_STANDALONE AND NOT LLDB_TABLEGEN_EXE) set(LLVM_USE_HOST_TOOLS ON) include(CrossCompile) if (NOT NATIVE_LLVM_DIR OR NOT NATIVE_Clang_DIR) From 9ed07781791054f9e129c892adf819b55d6e6c44 Mon Sep 17 00:00:00 2001 From: zoecarver Date: Fri, 13 Aug 2021 11:36:55 -0700 Subject: [PATCH 014/700] [libcxx][ranges] Move `namespace views` into `namespace ranges` and add an alias. 
Differential Revision: https://reviews.llvm.org/D108047 --- libcxx/include/__ranges/all.h | 4 ++-- libcxx/include/__ranges/counted.h | 4 ++-- libcxx/include/__ranges/iota_view.h | 2 +- libcxx/include/ranges | 2 ++ libcxx/test/std/ranges/range.adaptors/range.all/all.pass.cpp | 4 ++++ .../std/ranges/range.adaptors/range.counted/counted.pass.cpp | 4 ++++ .../range.factories/range.iota.view/views_iota.pass.cpp | 3 +++ 7 files changed, 18 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__ranges/all.h b/libcxx/include/__ranges/all.h index d678d3e5d357c..f44beaa272722 100644 --- a/libcxx/include/__ranges/all.h +++ b/libcxx/include/__ranges/all.h @@ -32,7 +32,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if !defined(_LIBCPP_HAS_NO_RANGES) -namespace views { +namespace ranges::views { namespace __all { struct __fn { @@ -75,7 +75,7 @@ inline namespace __cpo { template using all_t = decltype(views::all(declval<_Range>())); -} // namespace views +} // namespace ranges::views #endif // !defined(_LIBCPP_HAS_NO_RANGES) diff --git a/libcxx/include/__ranges/counted.h b/libcxx/include/__ranges/counted.h index d891c1f4efac4..d7240803608f7 100644 --- a/libcxx/include/__ranges/counted.h +++ b/libcxx/include/__ranges/counted.h @@ -36,7 +36,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if !defined(_LIBCPP_HAS_NO_RANGES) -namespace views { +namespace ranges::views { namespace __counted { template @@ -88,7 +88,7 @@ inline namespace __cpo { inline constexpr auto counted = __counted::__fn{}; } // namespace __cpo -} // namespace views +} // namespace ranges::views #endif // !defined(_LIBCPP_HAS_NO_RANGES) diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index f302826b02474..047d8460c268b 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -366,7 +366,6 @@ namespace ranges { template inline constexpr bool enable_borrowed_range> = true; -} // namespace ranges namespace views { namespace __iota { @@ -395,6 +394,7 @@ inline 
namespace __cpo { inline constexpr auto iota = __iota::__fn{}; } } // namespace views +} // namespace ranges #endif // !defined(_LIBCPP_HAS_NO_RANGES) diff --git a/libcxx/include/ranges b/libcxx/include/ranges index 014260aaee15b..a4cbfafd5f99e 100644 --- a/libcxx/include/ranges +++ b/libcxx/include/ranges @@ -238,6 +238,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER > 17 && !defined(_LIBCPP_HAS_NO_RANGES) +namespace views = ranges::views; + #endif // _LIBCPP_STD_VER > 17 && !defined(_LIBCPP_HAS_NO_RANGES) _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/std/ranges/range.adaptors/range.all/all.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.all/all.pass.cpp index 2aeb94f8bd4cb..cef873465eb65 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.all/all.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.all/all.pass.cpp @@ -142,6 +142,10 @@ constexpr bool test() { assert(std::ranges::end(subrange) == std::ranges::begin(subrange) + 8); } + { + static_assert(std::same_as); + } + return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp index 62791bbdd28a0..2ebd7036db33a 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.counted/counted.pass.cpp @@ -195,6 +195,10 @@ constexpr bool test() { } } + { + static_assert(std::same_as); + } + return true; } diff --git a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp index 661285585095c..6fcb4abe21dec 100644 --- a/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp +++ b/libcxx/test/std/ranges/range.factories/range.iota.view/views_iota.pass.cpp @@ -71,6 +71,9 @@ constexpr bool test() { static_assert( std::is_invocable_v); static_assert(!std::is_invocable_v); } + { + 
static_assert(std::same_as); + } return true; } From f80ae580686b99b69224c38ec4db6c982c814fe3 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 14:16:44 -0700 Subject: [PATCH 015/700] [NFC] Cleanup calls to AttributeList::getAttribute(FunctionIndex) getAttribute() is confusing, use a clearer method. --- llvm/include/llvm/IR/Attributes.h | 10 ++++++++++ llvm/include/llvm/IR/Function.h | 4 ++-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 4 +--- llvm/lib/IR/Verifier.cpp | 6 ++---- llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 3 +-- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 08220b1d6413b..09863187c6115 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -691,6 +691,16 @@ class AttributeList { return getAttribute(ArgNo + FirstArgIndex, Kind); } + /// Return the attribute object that exists for the function. + Attribute getFnAttr(Attribute::AttrKind Kind) const { + return getAttribute(FunctionIndex, Kind); + } + + /// Return the attribute object that exists for the function. + Attribute getFnAttr(StringRef Kind) const { + return getAttribute(FunctionIndex, Kind); + } + /// Return the alignment of the return value. MaybeAlign getRetAlignment() const; diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index ce0bbb54d846f..af891cd67f8ae 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -363,12 +363,12 @@ class Function : public GlobalObject, public ilist_node { /// Return the attribute for the given attribute kind. Attribute getFnAttribute(Attribute::AttrKind Kind) const { - return getAttribute(AttributeList::FunctionIndex, Kind); + return AttributeSets.getFnAttr(Kind); } /// Return the attribute for the given attribute kind. 
Attribute getFnAttribute(StringRef Kind) const { - return getAttribute(AttributeList::FunctionIndex, Kind); + return AttributeSets.getFnAttr(Kind); } /// Return the stack alignment for the function. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9e21a213a77aa..aebd89e93329e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6735,9 +6735,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::debugtrap: case Intrinsic::trap: { StringRef TrapFuncName = - I.getAttributes() - .getAttribute(AttributeList::FunctionIndex, "trap-func-name") - .getValueAsString(); + I.getAttributes().getFnAttr("trap-func-name").getValueAsString(); if (TrapFuncName.empty()) { switch (Intrinsic) { case Intrinsic::trap: diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index e7857b93333f7..5e93aa08c5af3 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1825,8 +1825,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, void Verifier::checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr, const Value *V) { if (Attrs.hasFnAttr(Attr)) { - StringRef S = Attrs.getAttribute(AttributeList::FunctionIndex, Attr) - .getValueAsString(); + StringRef S = Attrs.getFnAttr(Attr).getValueAsString(); unsigned N; if (S.getAsInteger(10, N)) CheckFailed("\"" + Attr + "\" takes an unsigned integer: " + S, V); @@ -2018,8 +2017,7 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, } if (Attrs.hasFnAttr("frame-pointer")) { - StringRef FP = Attrs.getAttribute(AttributeList::FunctionIndex, - "frame-pointer").getValueAsString(); + StringRef FP = Attrs.getFnAttr("frame-pointer").getValueAsString(); if (FP != "all" && FP != "non-leaf" && FP != "none") CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V); } diff --git 
a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 77701bf1d0c65..76b088cb5519f 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1534,8 +1534,7 @@ static StringRef getDeoptLowering(CallBase *Call) { // with values. const AttributeList &CSAS = Call->getAttributes(); if (CSAS.hasFnAttr(DeoptLowering)) - return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering) - .getValueAsString(); + return CSAS.getFnAttr(DeoptLowering).getValueAsString(); Function *F = Call->getCalledFunction(); assert(F && F->hasFnAttribute(DeoptLowering)); return F->getFnAttribute(DeoptLowering).getValueAsString(); From dc41c558dd907a676fb390d3a05a93bda960e3c0 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 14:35:48 -0700 Subject: [PATCH 016/700] [NFC] Make AttributeList::hasAttribute(AttributeList::ReturnIndex) its own method AttributeList::hasAttribute() is confusing. In an attempt to change the name to something that suggests using other methods, fix up some existing uses. --- llvm/include/llvm/IR/Attributes.h | 5 +++++ llvm/include/llvm/IR/Function.h | 5 +++++ llvm/include/llvm/IR/InstrTypes.h | 4 ++-- llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 09863187c6115..734279433c933 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -657,6 +657,11 @@ class AttributeList { return hasAttribute(ReturnIndex, Kind); } + /// Return true if the attribute exists for the return value. + bool hasRetAttr(StringRef Kind) const { + return hasAttribute(ReturnIndex, Kind); + } + /// Return true if attributes exist for the return value. 
bool hasRetAttrs() const { return hasAttributes(ReturnIndex); } diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index af891cd67f8ae..4c3758ebdb090 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -455,6 +455,11 @@ class Function : public GlobalObject, public ilist_node { return getAttributes().getParamAttr(ArgNo, Kind); } + /// check if an attribute is in the list of attributes for the return value. + bool hasRetAttribute(Attribute::AttrKind Kind) const { + return getAttributes().hasRetAttr(Kind); + } + /// gets the attribute from the list of attributes. Attribute getAttribute(unsigned i, Attribute::AttrKind Kind) const { return AttributeSets.getAttribute(i, Kind); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 65cada3d735de..142e1f29a641c 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -2272,12 +2272,12 @@ class CallBase : public Instruction { /// Determine whether the return value has the given attribute. Supports /// Attribute::AttrKind and StringRef as \p AttrKind types. template bool hasRetAttrImpl(AttrKind Kind) const { - if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind)) + if (Attrs.hasRetAttr(Kind)) return true; // Look at the callee, if available. 
if (const Function *F = getCalledFunction()) - return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind); + return F->getAttributes().hasRetAttr(Kind); return false; } }; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 385a9ec7457da..a727b76162d47 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3807,7 +3807,7 @@ struct MemorySanitizerVisitor : public InstVisitor { if (isAMustTailRetVal(RetVal)) return; Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); bool HasNoUndef = - F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef); + F.hasRetAttribute(Attribute::NoUndef); bool StoreShadow = !(ClEagerChecks && HasNoUndef); // FIXME: Consider using SpecialCaseList to specify a list of functions that // must always return fully initialized values. For now, we hardcode "main". From d5ff5ef65e1b66adf01e3c647c76578821338030 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 16:49:05 -0700 Subject: [PATCH 017/700] [NFC] One more AttributeList::getAttribute(FunctionIndex) -> getFnAttr() --- llvm/lib/IR/Statepoint.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/IR/Statepoint.cpp b/llvm/lib/IR/Statepoint.cpp index bbfbbe489bae7..b5916e4937c6e 100644 --- a/llvm/lib/IR/Statepoint.cpp +++ b/llvm/lib/IR/Statepoint.cpp @@ -26,16 +26,14 @@ StatepointDirectives llvm::parseStatepointDirectivesFromAttrs(AttributeList AS) { StatepointDirectives Result; - Attribute AttrID = - AS.getAttribute(AttributeList::FunctionIndex, "statepoint-id"); + Attribute AttrID = AS.getFnAttr("statepoint-id"); uint64_t StatepointID; if (AttrID.isStringAttribute()) if (!AttrID.getValueAsString().getAsInteger(10, StatepointID)) Result.StatepointID = StatepointID; uint32_t NumPatchBytes; - Attribute AttrNumPatchBytes = AS.getAttribute(AttributeList::FunctionIndex, 
- "statepoint-num-patch-bytes"); + Attribute AttrNumPatchBytes = AS.getFnAttr("statepoint-num-patch-bytes"); if (AttrNumPatchBytes.isStringAttribute()) if (!AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes)) Result.NumPatchBytes = NumPatchBytes; From c19d7f8af0321d1b90343496ebd1c1aec3f1dc8c Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Sat, 7 Aug 2021 00:28:19 -0700 Subject: [PATCH 018/700] [CallPromotion] Check for inalloca/byval mismatch Previously we would allow promotion even if the byval/inalloca attributes on the call and the callee didn't match. It's ok if the byval/inalloca types aren't the same. For example, LTO importing may rename types. Fixes PR51397. Reviewed By: rnk Differential Revision: https://reviews.llvm.org/D107998 --- .../Transforms/Utils/CallPromotionUtils.cpp | 20 +++++++++++++++-- .../Transforms/PGOProfile/mismatched-byval.ll | 22 +++++++++++++++++++ .../PGOProfile/mismatched-inalloca.ll | 21 ++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/PGOProfile/mismatched-byval.ll create mode 100644 llvm/test/Transforms/PGOProfile/mismatched-inalloca.ll diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 33cb8823086e0..ebe19f1751e55 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -424,6 +424,21 @@ bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee, *FailureReason = "Argument type mismatch"; return false; } + // Make sure that the callee and call agree on byval/inalloca. The types do + // not have to match. 
+ + if (Callee->hasParamAttribute(I, Attribute::ByVal) != + CB.getAttributes().hasParamAttr(I, Attribute::ByVal)) { + if (FailureReason) + *FailureReason = "byval mismatch"; + return false; + } + if (Callee->hasParamAttribute(I, Attribute::InAlloca) != + CB.getAttributes().hasParamAttr(I, Attribute::InAlloca)) { + if (FailureReason) + *FailureReason = "inalloca mismatch"; + return false; + } } for (; I < NumArgs; I++) { // Vararg functions can have more arguments than parameters. @@ -488,10 +503,11 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo)); ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); - // If byval is used, this must be a pointer type, and the byval type must - // match the element type. Update it if present. + // We may have a different byval/inalloca type. if (ArgAttrs.getByValType()) ArgAttrs.addByValAttr(Callee->getParamByValType(ArgNo)); + if (ArgAttrs.getInAllocaType()) + ArgAttrs.addInAllocaAttr(Callee->getParamInAllocaType(ArgNo)); NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs)); AttributeChanged = true; diff --git a/llvm/test/Transforms/PGOProfile/mismatched-byval.ll b/llvm/test/Transforms/PGOProfile/mismatched-byval.ll new file mode 100644 index 0000000000000..84c335c678473 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/mismatched-byval.ll @@ -0,0 +1,22 @@ +; RUN: opt -passes=pgo-icall-prom -profile-summary-hot-count=10 -S < %s -pass-remarks-output=- | FileCheck %s + +; CHECK: byval mismatch + +define void @a(i8* %0) !prof !0 { + ret void +} + +define void @b(void (i64*)** %v, i64* %p) !prof !1 { +; CHECK-LABEL: @b +; CHECK-NEXT: load +; CHECK-NEXT: call void {{.*}}(i64* byval(i64) +; CHECK-NEXT: ret void +; + %a = load void (i64*)*, void (i64*)** %v + call void %a(i64* byval(i64) %p), !prof !2 + ret void +} + +!0 = !{!"function_entry_count", i64 36} +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"VP", i32 0, i64 18, i64 12157170054180749580, i64 18} 
diff --git a/llvm/test/Transforms/PGOProfile/mismatched-inalloca.ll b/llvm/test/Transforms/PGOProfile/mismatched-inalloca.ll new file mode 100644 index 0000000000000..5ac174cd7c733 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/mismatched-inalloca.ll @@ -0,0 +1,21 @@ +; RUN: opt -passes=pgo-icall-prom -profile-summary-hot-count=10 -S < %s -pass-remarks-output=- | FileCheck %s + +; CHECK: inalloca mismatch + +define void @a(i8* %0) !prof !0 { + ret void +} + +define void @b(void (i64*)** %v, i64* %p) !prof !1 { +; CHECK-LABEL: @b +; CHECK-NEXT: load +; CHECK-NEXT: call void {{.*}}(i64* inalloca(i64) +; CHECK-NEXT: ret void + %a = load void (i64*)*, void (i64*)** %v + call void %a(i64* inalloca(i64) %p), !prof !2 + ret void +} + +!0 = !{!"function_entry_count", i64 36} +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"VP", i32 0, i64 18, i64 12157170054180749580, i64 18} From 16e8134e7c770849a6a1b9c79fdd548a0ef99190 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 13 Aug 2021 16:56:42 -0700 Subject: [PATCH 019/700] [NFC] One more AttributeList::getAttribute(FunctionIndex) -> getFnAttr() --- llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 574524410a916..a476d3ed0909e 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -238,9 +238,9 @@ const HexagonSubtarget * HexagonTargetMachine::getSubtargetImpl(const Function &F) const { AttributeList FnAttrs = F.getAttributes(); Attribute CPUAttr = - FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-cpu"); + FnAttrs.getFnAttr("target-cpu"); Attribute FSAttr = - FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-features"); + FnAttrs.getFnAttr("target-features"); std::string CPU = CPUAttr.isValid() ? 
CPUAttr.getValueAsString().str() : TargetCPU; From 5beb9a0e6aec4a4901229377d8cb9e6115956446 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 13 Aug 2021 09:20:17 -0400 Subject: [PATCH 020/700] AMDGPU: Respect compute ABI attributes with unknown OS Unfortunately Mesa is still using amdgcn-- as the triple for OpenGL, so we still have the awkward unknown OS case to deal with. Previously if the HSA ABI intrinsics appeared, we we would not add the ABI registers to the function. We would emit an error later, but we still need to produce some compile result. Start adding the registers to any compute function, regardless of the OS. This keeps the internal state more consistent, and will help avoid numerous test crashes in a future patch which starts assuming the ABI inputs are present on functions by default. --- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 16 ++++++++-------- llvm/test/CodeGen/AMDGPU/trap-abis.ll | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 85cfe36df16aa..76bec48d77f19 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -151,10 +151,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (isAmdHsaOrMesa) { - if (!ST.enableFlatScratch()) - PrivateSegmentBuffer = true; + if (isAmdHsaOrMesa && !ST.enableFlatScratch()) + PrivateSegmentBuffer = true; + else if (ST.isMesaGfxShader(F)) + ImplicitBufferPtr = true; + + if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; + if (!AMDGPU::isGraphics(CC)) { if (UseFixedABI) { DispatchPtr = true; QueuePtr = true; @@ -171,13 +176,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true; } - } else if 
(ST.isMesaGfxShader(F)) { - ImplicitBufferPtr = true; } - if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) - KernargSegmentPtr = true; - // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls or stack objects that may require it before argument // lowering. diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index 4fe18cb5608a7..d3f6b7c9539db 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -18,7 +18,7 @@ declare void @llvm.debugtrap() #1 define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-V2-LABEL: trap: ; NOHSA-TRAP-GFX900-V2: ; %bb.0: -; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V2-NEXT: s_waitcnt lgkmcnt(0) @@ -28,7 +28,7 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) { ; ; NOHSA-TRAP-GFX900-V3-LABEL: trap: ; NOHSA-TRAP-GFX900-V3: ; %bb.0: -; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) @@ -38,7 +38,7 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) { ; ; NOHSA-TRAP-GFX900-V4-LABEL: trap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -357,7 +357,7 @@ define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) 
{ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly %arg0) local_unnamed_addr { ; NOHSA-TRAP-GFX900-V2-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900-V2: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-V2-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -375,7 +375,7 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly % ; ; NOHSA-TRAP-GFX900-V3-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900-V3: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V3-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-V3-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -393,7 +393,7 @@ define amdgpu_kernel void @non_entry_trap(i32 addrspace(1)* nocapture readonly % ; ; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -805,7 +805,7 @@ ret: define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V2: ; %bb.0: -; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -818,7 +818,7 @@ define amdgpu_kernel void @debugtrap(i32 
addrspace(1)* nocapture readonly %arg0) ; ; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V3: ; %bb.0: -; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -831,7 +831,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0) ; ; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 From 152ceec1ae8f5c1af2970210a96d1d4a952aa2ff Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Aug 2021 19:01:30 -0400 Subject: [PATCH 021/700] AMDGPU: Add indirect and extern calls to attributor test --- .../annotate-kernel-features-hsa-call.ll | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 06d4f140538a0..0c25ff94427d0 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -738,6 +738,82 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { ret float %fadd } +define float @func_indirect_call(float()* %fptr) #3 { +; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call +; AKF_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR21:[0-9]+]] { +; AKF_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() +; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; AKF_HSA-NEXT: ret float [[FADD]] +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call +; 
ATTRIBUTOR_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR6]] { +; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() +; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] +; + %f = call float %fptr() + %fadd = fadd float %f, 1.0 + ret float %fadd +} + +declare float @extern() #3 +define float @func_extern_call() #3 { +; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call +; AKF_HSA-SAME: () #[[ATTR17]] { +; AKF_HSA-NEXT: [[F:%.*]] = call float @extern() +; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; AKF_HSA-NEXT: ret float [[FADD]] +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call +; ATTRIBUTOR_HSA-SAME: () #[[ATTR6]] { +; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() #[[ATTR10]] +; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] +; + %f = call float @extern() + %fadd = fadd float %f, 1.0 + ret float %fadd +} + +define float @func_null_call(float()* %fptr) #3 { +; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call +; AKF_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR21]] { +; AKF_HSA-NEXT: [[F:%.*]] = call float null() +; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; AKF_HSA-NEXT: ret float [[FADD]] +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call +; ATTRIBUTOR_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR6]] { +; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() +; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] +; + %f = call float null() + %fadd = fadd float %f, 1.0 + ret float %fadd +} + +declare float @llvm.amdgcn.rcp.f32(float) #0 + +; Calls some other recognized intrinsic +define float @func_other_intrinsic_call(float %arg) #3 { +; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call +; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR18]] { +; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) +; 
AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; AKF_HSA-NEXT: ret float [[FADD]] +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call +; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR7]] { +; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) +; ATTRIBUTOR_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 +; ATTRIBUTOR_HSA-NEXT: ret float [[FADD]] +; + %f = call float @llvm.amdgcn.rcp.f32(float %arg) + %fadd = fadd float %f, 1.0 + ret float %fadd +} + attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind "target-cpu"="fiji" } attributes #2 = { nounwind "target-cpu"="gfx900" } @@ -765,6 +841,7 @@ attributes #3 = { nounwind } ; AKF_HSA: attributes #[[ATTR18]] = { nounwind } ; AKF_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" } ; AKF_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" } +; AKF_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } From a77ae4aa6a351c916876ac83529c8122607599ad Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 12 Aug 2021 15:19:54 -0400 Subject: [PATCH 022/700] AMDGPU: Stop attributor adding attributes to intrinsic declarations --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 18 ++++++++---------- .../AMDGPU/addrspacecast-constantexpr.ll | 2 +- .../annotate-kernel-features-hsa-call.ll | 2 +- .../AMDGPU/annotate-kernel-features-hsa.ll | 2 +- .../CodeGen/AMDGPU/annotate-kernel-features.ll | 2 +- .../CodeGen/AMDGPU/pal-simple-indirect-call.ll | 2 +- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 61b1d22edc330..0c24903490f0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -302,12 +302,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { CallingConv::ID CC = F->getCallingConv(); bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx); - // Don't add attributes to instrinsics - if (F->isIntrinsic()) { - indicatePessimisticFixpoint(); - return; - } - // Ignore functions with graphics calling conventions, these are currently // not allowed to have kernel arguments. 
if (AMDGPU::isGraphics(F->getCallingConv())) { @@ -500,8 +494,10 @@ class AMDGPUAttributor : public ModulePass { bool runOnModule(Module &M) override { SetVector Functions; AnalysisGetter AG; - for (Function &F : M) - Functions.insert(&F); + for (Function &F : M) { + if (!F.isIntrinsic()) + Functions.insert(&F); + } CallGraphUpdater CGUpdater; BumpPtrAllocator Allocator; @@ -509,8 +505,10 @@ class AMDGPUAttributor : public ModulePass { Attributor A(Functions, InfoCache, CGUpdater); for (Function &F : M) { - A.getOrCreateAAFor(IRPosition::function(F)); - A.getOrCreateAAFor(IRPosition::function(F)); + if (!F.isIntrinsic()) { + A.getOrCreateAAFor(IRPosition::function(F)); + A.getOrCreateAAFor(IRPosition::function(F)); + } } ChangeStatus Change = A.run(); diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index d235753453ad2..e46bdd9aaf0bc 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -172,7 +172,7 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-queue-ptr" } ;. -; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { argmemonly nofree nounwind willreturn "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { argmemonly nofree nounwind willreturn } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-queue-ptr" "uniform-work-group-size"="false" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 0c25ff94427d0..d5d87289f4b9e 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -843,7 +843,7 @@ attributes #3 = { nounwind } ; AKF_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" } ; AKF_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ;. -; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { noreturn nounwind readnone "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { noreturn nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 15225677da7c3..99fab98422f71 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -520,7 +520,7 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-kernarg-segment-ptr" } ; AKF_HSA: attributes #[[ATTR13]] = { 
nounwind "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-work-group-id-y" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-work-group-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index df765fb2d4899..9bbdb7365c2d3 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -325,7 +325,7 @@ attributes #1 = { nounwind } ; AKF_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ; AKF_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ;. 
-; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "uniform-work-group-size"="false" } ; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-work-group-id-y" "uniform-work-group-size"="false" } ; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-work-group-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll index a92f0e2d48362..f890ab2625f1b 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -71,5 +71,5 @@ attributes #0 = { nounwind readnone speculatable willreturn } ; AKF_GCN: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nounwind readnone speculatable willreturn "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nounwind readnone speculatable willreturn } ;. From d63f117210d1e189857e49320c15f486e453696a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 13 Aug 2021 17:39:52 -0700 Subject: [PATCH 023/700] [RISCV] Support RISCVISD::SELECT_CC in ComputeNumSignBitsForTargetNode. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +++ llvm/test/CodeGen/RISCV/select-cc.ll | 57 ++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0c92a1f54d4d6..8da71a1695a0e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -6723,6 +6723,12 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( switch (Op.getOpcode()) { default: break; + case RISCVISD::SELECT_CC: { + unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth + 1); + if (Tmp == 1) return 1; // Early out. + unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(4), DemandedElts, Depth + 1); + return std::min(Tmp, Tmp2); + } case RISCVISD::SLLW: case RISCVISD::SRAW: case RISCVISD::SRLW: diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll index edc3bf0261f7a..bffba3aaa81bf 100644 --- a/llvm/test/CodeGen/RISCV/select-cc.ll +++ b/llvm/test/CodeGen/RISCV/select-cc.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zbt -disable-block-placement -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32IBT %s -define i32 @foo(i32 %a, i32 *%b) nounwind { +define signext i32 @foo(i32 signext %a, i32 *%b) nounwind { ; RV32I-LABEL: foo: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a2, 0(a1) @@ -159,3 +159,58 @@ define i32 @foo(i32 %a, i32 *%b) nounwind { ret i32 %val24 } + +; Test that we can ComputeNumSignBits across basic blocks when the live out is +; RISCVISD::SELECT_CC. There should be no slli+srai or sext.h in the output. 
+define signext i16 @numsignbits(i16 signext %0, i16 signext %1, i16 signext %2, i16 signext %3) nounwind { +; RV32I-LABEL: numsignbits: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a3 +; RV32I-NEXT: beqz a0, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: mv s0, a2 +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: beqz a1, .LBB1_4 +; RV32I-NEXT: # %bb.3: +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call bar@plt +; RV32I-NEXT: .LBB1_4: +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV32IBT-LABEL: numsignbits: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: addi sp, sp, -16 +; RV32IBT-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IBT-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32IBT-NEXT: cmov s0, a0, a2, a3 +; RV32IBT-NEXT: beqz a1, .LBB1_2 +; RV32IBT-NEXT: # %bb.1: +; RV32IBT-NEXT: mv a0, s0 +; RV32IBT-NEXT: call bar@plt +; RV32IBT-NEXT: .LBB1_2: +; RV32IBT-NEXT: mv a0, s0 +; RV32IBT-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32IBT-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IBT-NEXT: addi sp, sp, 16 +; RV32IBT-NEXT: ret + %5 = icmp eq i16 %0, 0 + %6 = select i1 %5, i16 %3, i16 %2 + %7 = icmp eq i16 %1, 0 + br i1 %7, label %9, label %8 + +8: ; preds = %4 + tail call void @bar(i16 signext %6) + br label %9 + +9: ; preds = %8, %4 + ret i16 %6 +} + +declare void @bar(i16 signext) From cc56152f83b41047998c1b7e0a6d8bfe8671f605 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 31 Jul 2021 12:05:33 -0400 Subject: [PATCH 024/700] GlobalISel: Add helper function for getting EVT from LLT This can only give an imperfect approximation, but is enough to avoid crashing in places where we call into EVT functions starting from LLTs. 
--- llvm/include/llvm/CodeGen/LowLevelType.h | 3 ++- llvm/lib/CodeGen/LowLevelType.cpp | 10 ++++++++++ llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 +++-- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/LowLevelType.h b/llvm/include/llvm/CodeGen/LowLevelType.h index 40985e16b37a7..922f93d2e5985 100644 --- a/llvm/include/llvm/CodeGen/LowLevelType.h +++ b/llvm/include/llvm/CodeGen/LowLevelType.h @@ -16,8 +16,8 @@ #ifndef LLVM_CODEGEN_LOWLEVELTYPE_H #define LLVM_CODEGEN_LOWLEVELTYPE_H +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/Support/LowLevelTypeImpl.h" -#include "llvm/Support/MachineValueType.h" namespace llvm { @@ -31,6 +31,7 @@ LLT getLLTForType(Type &Ty, const DataLayout &DL); /// Get a rough equivalent of an MVT for a given LLT. MVT can't distinguish /// pointers, so these will convert to a plain integer. MVT getMVTForLLT(LLT Ty); +EVT getApproximateEVTForLLT(LLT Ty, const DataLayout &DL, LLVMContext &Ctx); /// Get a rough equivalent of an LLT for a given MVT. LLT does not yet support /// scalarable vector types, and will assert if used. 
diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp index 62e9c6b629d3b..dce64ab9f5ca4 100644 --- a/llvm/lib/CodeGen/LowLevelType.cpp +++ b/llvm/lib/CodeGen/LowLevelType.cpp @@ -52,6 +52,16 @@ MVT llvm::getMVTForLLT(LLT Ty) { Ty.getNumElements()); } +EVT llvm::getApproximateEVTForLLT(LLT Ty, const DataLayout &DL, + LLVMContext &Ctx) { + if (Ty.isVector()) { + EVT EltVT = getApproximateEVTForLLT(Ty.getElementType(), DL, Ctx); + return EVT::getVectorVT(Ctx, EltVT, Ty.getElementCount()); + } + + return EVT::getIntegerVT(Ctx, Ty.getSizeInBits()); +} + LLT llvm::getLLTForMVT(MVT Ty) { if (!Ty.isVector()) return LLT::scalar(Ty.getSizeInBits()); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 0b817c99d20a5..74946c09fad96 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1749,8 +1749,9 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty, const MachineMemOperand &MMO, bool *Fast) const { - return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(), - MMO.getAlign(), MMO.getFlags(), Fast); + EVT VT = getApproximateEVTForLLT(Ty, DL, Context); + return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), MMO.getAlign(), + MMO.getFlags(), Fast); } //===----------------------------------------------------------------------===// From 50efbf9cbeed9d202e9f66a0d154489811918944 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Wed, 11 Aug 2021 13:20:33 -0700 Subject: [PATCH 025/700] [GlobalISel] Narrow binops feeding into G_AND with a mask This is a fairly common pattern: ``` %mask = G_CONSTANT iN %add = G_ADD %lhs, %rhs %and = G_AND %add, %mask ``` We have combines to eliminate G_AND with a mask that does nothing. 
If we combined the above to this: ``` %mask = G_CONSTANT iN %narrow_lhs = G_TRUNC %lhs %narrow_rhs = G_TRUNC %rhs %narrow_add = G_ADD %narrow_lhs, %narrow_rhs %ext = G_ZEXT %narrow_add %and = G_AND %ext, %mask ``` We'd be able to take advantage of those combines using the trunc + zext. For this to work (or be beneficial in the best case) - The operation we want to narrow then widen must only be used by the G_AND - The G_TRUNC + G_ZEXT must be free - Performing the operation at a narrower width must not produce a different value than performing it at the original width *after masking.* Example comparison between SDAG + GISel: https://godbolt.org/z/63jzb1Yvj At -Os for AArch64, this is a 0.2% code size improvement on CTMark/pairlocalign. Differential Revision: https://reviews.llvm.org/D107929 --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 5 + llvm/include/llvm/CodeGen/TargetLowering.h | 15 +- .../include/llvm/Target/GlobalISel/Combine.td | 9 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 91 +++++ ...izer-combiner-narrow-binop-feeding-add.mir | 332 ++++++++++++++++++ 5 files changed, 447 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 555be8be8885b..8855631859fcf 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -554,6 +554,11 @@ class CombinerHelper { /// Do constant folding when opportunities are exposed after MIR building. bool matchConstantFold(MachineInstr &MI, APInt &MatchInfo); + /// \returns true if it is possible to narrow the width of a scalar binop + /// feeding a G_AND instruction \p MI. + bool matchNarrowBinopFeedingAnd( + MachineInstr &MI, std::function &MatchInfo); + /// Try to transform \p MI by using all of the above /// combine functions. 
Returns true if changed. bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d0328b1405afe..07de68fa1bf21 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -30,6 +30,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -2509,8 +2510,11 @@ class TargetLoweringBase { return false; } - virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { - return false; + virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; } + virtual bool isTruncateFree(LLT FromTy, LLT ToTy, const DataLayout &DL, + LLVMContext &Ctx) const { + return isTruncateFree(getApproximateEVTForLLT(FromTy, DL, Ctx), + getApproximateEVTForLLT(ToTy, DL, Ctx)); } virtual bool isProfitableToHoist(Instruction *I) const { return true; } @@ -2586,8 +2590,11 @@ class TargetLoweringBase { return false; } - virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { - return false; + virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; } + virtual bool isZExtFree(LLT FromTy, LLT ToTy, const DataLayout &DL, + LLVMContext &Ctx) const { + return isZExtFree(getApproximateEVTForLLT(FromTy, DL, Ctx), + getApproximateEVTForLLT(ToTy, DL, Ctx)); } /// Return true if sign-extension from FromTy to ToTy is cheaper than diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 13ebc43697c2f..e65073a1d28d0 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -197,6 +197,12 @@ def reduce_shl_of_extend : GICombineRule< [{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]), (apply [{ 
Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>; +def narrow_binop_feeding_and : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_AND):$root, + [{ return Helper.matchNarrowBinopFeedingAnd(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>; + // [us]itofp(undef) = 0, because the result value is bounded. def undef_to_fp_zero : GICombineRule< (defs root:$root), @@ -698,7 +704,8 @@ def known_bits_simplifications : GICombineGroup<[ redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask, zext_trunc_fold, icmp_to_true_false_known_bits]>; -def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>; +def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend, + narrow_binop_feeding_and]>; def phi_combines : GICombineGroup<[extend_through_phis]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 23ff22fe3aa66..732b7ed5dd9d6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4344,6 +4344,97 @@ bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) { return true; } +bool CombinerHelper::matchNarrowBinopFeedingAnd( + MachineInstr &MI, std::function &MatchInfo) { + // Look for a binop feeding into an AND with a mask: + // + // %add = G_ADD %lhs, %rhs + // %and = G_AND %add, 000...11111111 + // + // Check if it's possible to perform the binop at a narrower width and zext + // back to the original width like so: + // + // %narrow_lhs = G_TRUNC %lhs + // %narrow_rhs = G_TRUNC %rhs + // %narrow_add = G_ADD %narrow_lhs, %narrow_rhs + // %new_add = G_ZEXT %narrow_add + // %and = G_AND %new_add, 000...11111111 + // + // This can allow later combines to eliminate the G_AND if it turns out + // that the mask is irrelevant. 
+ assert(MI.getOpcode() == TargetOpcode::G_AND); + Register Dst = MI.getOperand(0).getReg(); + Register AndLHS = MI.getOperand(1).getReg(); + Register AndRHS = MI.getOperand(2).getReg(); + LLT WideTy = MRI.getType(Dst); + + // If the potential binop has more than one use, then it's possible that one + // of those uses will need its full width. + if (!WideTy.isScalar() || !MRI.hasOneNonDBGUse(AndLHS)) + return false; + + // Check if the LHS feeding the AND is impacted by the high bits that we're + // masking out. + // + // e.g. for 64-bit x, y: + // + // add_64(x, y) & 65535 == zext(add_16(trunc(x), trunc(y))) & 65535 + MachineInstr *LHSInst = getDefIgnoringCopies(AndLHS, MRI); + if (!LHSInst) + return false; + unsigned LHSOpc = LHSInst->getOpcode(); + switch (LHSOpc) { + default: + return false; + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_MUL: + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: + break; + } + + // Find the mask on the RHS. + auto Cst = getConstantVRegValWithLookThrough(AndRHS, MRI); + if (!Cst) + return false; + auto Mask = Cst->Value; + if (!Mask.isMask()) + return false; + + // No point in combining if there's nothing to truncate. + unsigned NarrowWidth = Mask.countTrailingOnes(); + if (NarrowWidth == WideTy.getSizeInBits()) + return false; + LLT NarrowTy = LLT::scalar(NarrowWidth); + + // Check if adding the zext + truncates could be harmful. 
+ auto &MF = *MI.getMF(); + const auto &TLI = getTargetLowering(); + LLVMContext &Ctx = MF.getFunction().getContext(); + auto &DL = MF.getDataLayout(); + if (!TLI.isTruncateFree(WideTy, NarrowTy, DL, Ctx) || + !TLI.isZExtFree(NarrowTy, WideTy, DL, Ctx)) + return false; + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {NarrowTy, WideTy}}) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {WideTy, NarrowTy}})) + return false; + Register BinOpLHS = LHSInst->getOperand(1).getReg(); + Register BinOpRHS = LHSInst->getOperand(2).getReg(); + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto NarrowLHS = Builder.buildTrunc(NarrowTy, BinOpLHS); + auto NarrowRHS = Builder.buildTrunc(NarrowTy, BinOpRHS); + auto NarrowBinOp = + Builder.buildInstr(LHSOpc, {NarrowTy}, {NarrowLHS, NarrowRHS}); + auto Ext = Builder.buildZExt(WideTy, NarrowBinOp); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(Ext.getReg(0)); + Observer.changedInstr(MI); + }; + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir new file mode 100644 index 0000000000000..fb19cda303d36 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-narrow-binop-feeding-add.mir @@ -0,0 +1,332 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +... 
+--- +name: add_64_mask_32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: add_64_mask_32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64) + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ADD]](s32) + ; CHECK: $x0 = COPY [[ZEXT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... +--- +name: sub_64_mask_32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: sub_64_mask_32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64) + ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[SUB]](s32) + ; CHECK: $x0 = COPY [[ZEXT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_SUB %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: mul_64_mask_32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: mul_64_mask_32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64) + ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[MUL]](s32) + ; CHECK: $x0 = COPY [[ZEXT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_MUL %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... +--- +name: and_64_mask_32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: and_64_mask_32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[AND]](s32) + ; CHECK: $x0 = COPY [[ZEXT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_AND %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: or_64_mask_32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: or_64_mask_32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: %mask_32:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK: %binop:_(s64) = G_SUB %binop_lhs, %binop_rhs + ; CHECK: %and:_(s64) = G_OR %binop, %mask_32 + ; CHECK: $x0 = COPY %and(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_SUB %binop_lhs, %binop_rhs + %and:_(s64) = G_OR %binop, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... +--- +name: xor_64_mask_32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: xor_64_mask_32 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64) + ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[XOR]](s32) + ; CHECK: $x0 = COPY [[ZEXT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_XOR %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: walk_thru_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: walk_thru_copy + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %binop_lhs(s64) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %binop_rhs(s64) + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ADD]](s32) + ; CHECK: $x0 = COPY [[ZEXT]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %copy:_(s64) = COPY %binop + %and:_(s64) = G_AND %copy, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... +--- +name: dont_combine_zext_not_free_add_64_mask_16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: dont_combine_zext_not_free_add_64_mask_16 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: %mask_16:_(s64) = G_CONSTANT i64 65535 + ; CHECK: %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + ; CHECK: %and:_(s64) = G_AND %binop, %mask_16 + ; CHECK: $x0 = COPY %and(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_16:_(s64) = G_CONSTANT i64 65535 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_16 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: dont_combine_zext_not_free_add_64_mask_8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: dont_combine_zext_not_free_add_64_mask_8 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: %mask_8:_(s64) = G_CONSTANT i64 255 + ; CHECK: %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + ; CHECK: %and:_(s64) = G_AND %binop, %mask_8 + ; CHECK: $x0 = COPY %and(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_8:_(s64) = G_CONSTANT i64 255 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_8 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... +--- +name: dont_combine_not_a_mask +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: dont_combine_not_a_mask + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: %not_a_mask:_(s64) = G_CONSTANT i64 26 + ; CHECK: %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + ; CHECK: %and:_(s64) = G_AND %binop, %not_a_mask + ; CHECK: $x0 = COPY %and(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %not_a_mask:_(s64) = G_CONSTANT i64 26 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %not_a_mask + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: dont_combine_more_than_one_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: dont_combine_more_than_one_use + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: %not_a_mask:_(s64) = G_CONSTANT i64 26 + ; CHECK: %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + ; CHECK: %and:_(s64) = G_AND %binop, %not_a_mask + ; CHECK: %or:_(s64) = G_OR %and, %binop + ; CHECK: $x0 = COPY %or(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %not_a_mask:_(s64) = G_CONSTANT i64 26 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %not_a_mask + %or:_(s64) = G_OR %and, %binop + $x0 = COPY %or(s64) + RET_ReallyLR implicit $x0 +... +--- +name: dont_combine_vector +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $q1 + ; CHECK-LABEL: name: dont_combine_vector + ; CHECK: liveins: $q0, $q1 + ; CHECK: %binop_lhs:_(<2 x s64>) = COPY $q0 + ; CHECK: %binop_rhs:_(<2 x s64>) = COPY $q1 + ; CHECK: %mask_elt:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK: %mask:_(<2 x s64>) = G_BUILD_VECTOR %mask_elt(s64), %mask_elt(s64) + ; CHECK: %binop:_(<2 x s64>) = G_ADD %binop_lhs, %binop_rhs + ; CHECK: %and:_(<2 x s64>) = G_AND %binop, %mask + ; CHECK: $q0 = COPY %and(<2 x s64>) + ; CHECK: RET_ReallyLR implicit $q0 + %binop_lhs:_(<2 x s64>) = COPY $q0 + %binop_rhs:_(<2 x s64>) = COPY $q1 + %mask_elt:_(s64) = G_CONSTANT i64 4294967295 + %mask:_(<2 x s64>) = G_BUILD_VECTOR %mask_elt, %mask_elt + %binop:_(<2 x s64>) = G_ADD %binop_lhs, %binop_rhs + %and:_(<2 x s64>) = G_AND %binop, %mask + $q0 = COPY %and(<2 x s64>) + RET_ReallyLR implicit $q0 +... 
+--- +name: dont_combine_add_64_mask_64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: dont_combine_add_64_mask_64 + ; CHECK: liveins: $x0, $x1 + ; CHECK: %binop_lhs:_(s64) = COPY $x0 + ; CHECK: %binop_rhs:_(s64) = COPY $x1 + ; CHECK: %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + ; CHECK: $x0 = COPY %binop(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %binop_lhs:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_64:_(s64) = G_CONSTANT i64 18446744073709551615 + %binop:_(s64) = G_ADD %binop_lhs, %binop_rhs + %and:_(s64) = G_AND %binop, %mask_64 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 +... +--- +name: dont_combine_copy_from_physreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0, $x1 + ; CHECK-LABEL: name: dont_combine_copy_from_physreg + ; CHECK: liveins: $x0, $x1 + ; CHECK: %copy_from_physreg:_(s64) = COPY $x0 + ; CHECK: %mask_32:_(s64) = G_CONSTANT i64 4294967295 + ; CHECK: %and:_(s64) = G_AND %copy_from_physreg, %mask_32 + ; CHECK: $x0 = COPY %and(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %copy_from_physreg:_(s64) = COPY $x0 + %binop_rhs:_(s64) = COPY $x1 + %mask_32:_(s64) = G_CONSTANT i64 4294967295 + %copy:_(s64) = COPY %copy_from_physreg + %and:_(s64) = G_AND %copy, %mask_32 + $x0 = COPY %and(s64) + RET_ReallyLR implicit $x0 From 2af4db7d5cc8e752634a249cf328b27836810245 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Thu, 12 Aug 2021 22:56:43 -0700 Subject: [PATCH 026/700] Migrate DWARFVerifier tests to lit-based yaml instead of gtest with embedded yaml Improves maintainability (edit/modify the tests without recompiling) and error messages (previously the failure would be a gtest failure mentioning nothing of the input or desired text) and the option to improve tests with more checks. 
(maybe these tests shouldn't all be in separate files - we could probably have DWARF yaml that contains multiple errors while still being fairly maintainable - the various invalid offsets (ref_addr, rnglists, ranges, etc) could probably be all in one test, but for the simple sake of the migration I just did the mechanical thing here) --- .../X86/verify_cu_dont_share_line_table.yaml | 74 ++ .../X86/verify_curanges_incomplete.yaml | 49 + .../X86/verify_duplicate_file_warning.yaml | 70 ++ .../X86/verify_elided_doesnt_fail.yaml | 56 + .../X86/verify_invalid_cu_ref.yaml | 44 + .../X86/verify_invalid_die_range.yaml | 47 + .../X86/verify_invalid_line_file_index.yaml | 66 ++ .../X86/verify_invalid_line_sequence.yaml | 64 + ...invalid_line_table_prologue_dir_index.yaml | 66 ++ .../X86/verify_invalid_ranges.yaml | 33 + .../X86/verify_invalid_ref_addr.yaml | 44 + .../X86/verify_invalid_ref_addr_between.yaml | 44 + .../X86/verify_invalid_rnglists.yaml | 34 + .../X86/verify_invalid_stmt_list.yaml | 33 + .../X86/verify_invalid_strp.yaml | 29 + .../X86/verify_lexical_block_ranges.yaml | 60 + .../X86/verify_nested_functions.yaml | 59 + .../verify_overlapping_function_ranges.yaml | 54 + ...rify_overlapping_lexical_block_ranges.yaml | 71 ++ .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 1051 +---------------- 20 files changed, 998 insertions(+), 1050 deletions(-) create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_cu_dont_share_line_table.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_duplicate_file_warning.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_elided_doesnt_fail.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_file_index.yaml create mode 100644 
llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_sequence.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_table_prologue_dir_index.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_nested_functions.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_cu_dont_share_line_table.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_cu_dont_share_line_table.yaml new file mode 100644 index 0000000000000..3d0441ad8ca42 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_cu_dont_share_line_table.yaml @@ -0,0 +1,74 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... 
+# CHECK-NEXT: error: two compile unit DIEs, 0x0000000b and 0x0000001f, have the same DW_AT_stmt_list section offset: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - /tmp/foo.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000000000 + - Length: 16 + Version: 4 + AbbrevTableID: 0 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000000000 + debug_line: + - Version: 2 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - /tmp + Files: + - Name: main.c + DirIdx: 1 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4096 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 4096 + - Opcode: DW_LNS_copy + Data: 4096 + - Opcode: DW_LNS_advance_pc + Data: 256 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 256 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml new file mode 100644 index 0000000000000..39fadca9f5ba9 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_curanges_incomplete.yaml @@ -0,0 +1,49 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DIE address ranges are not contained in its parent's ranges: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000001000 + - Value: 0x0000000000001500 + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_duplicate_file_warning.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_duplicate_file_warning.yaml new file mode 100644 index 0000000000000..d9ebd57d13fbb --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_duplicate_file_warning.yaml @@ -0,0 +1,70 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... 
+# CHECK-NEXT: warning: .debug_line[0x00000000].prologue.file_names[2] is a duplicate of file_names[1] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000000000 + debug_line: + - Version: 2 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - /tmp + Files: + - Name: main.c + DirIdx: 1 + ModTime: 0 + Length: 0 + - Name: main.c + DirIdx: 1 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4096 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 4096 + - Opcode: DW_LNS_copy + Data: 4096 + - Opcode: DW_LNS_advance_pc + Data: 16 + - Opcode: DW_LNS_set_file + Data: 1 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 2 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_elided_doesnt_fail.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_elided_doesnt_fail.yaml new file mode 100644 index 0000000000000..ff517f76a40e1 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_elided_doesnt_fail.yaml @@ -0,0 +1,56 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-dwarfdump -verify %t.o | FileCheck --implicit-check-not=error: %s + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + - elided + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr 
+ - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000002 + Values: + - Value: 0x0000000000000012 + - Value: 0x0000000000002000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml new file mode 100644 index 0000000000000..d3a5ff7c5dcf5 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_cu_ref.yaml @@ -0,0 +1,44 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DW_FORM_ref4 CU offset 0x00001234 is invalid (must be less than CU size of 0x0000001a): + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001234 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml new file mode 100644 index 0000000000000..31ff1231eab8d --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_die_range.yaml @@ -0,0 +1,47 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: Invalid address range + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001000 + - Value: 0x0000000000000900 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_file_index.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_file_index.yaml new file mode 100644 index 0000000000000..1a0862d01efd5 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_file_index.yaml @@ -0,0 +1,66 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... 
+# CHECK-NEXT: error: .debug_line[0x00000000][1] has invalid file index 5 (valid values are [1,1]): + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000000000 + debug_line: + - Version: 2 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - /tmp + Files: + - Name: main.c + DirIdx: 1 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4096 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 4096 + - Opcode: DW_LNS_copy + Data: 4096 + - Opcode: DW_LNS_advance_pc + Data: 16 + - Opcode: DW_LNS_set_file + Data: 5 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 5 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_sequence.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_sequence.yaml new file mode 100644 index 0000000000000..c587707b9f7c8 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_sequence.yaml @@ -0,0 +1,64 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... 
+# CHECK-NEXT: error: .debug_line[0x00000000] row[1] decreases in address from previous row: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000000000 + debug_line: + - Version: 2 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - /tmp + Files: + - Name: main.c + DirIdx: 1 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4112 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 4112 + - Opcode: DW_LNS_copy + Data: 4112 + - Opcode: DW_LNS_advance_pc + Data: 18446744073709551600 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 18446744073709551600 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_table_prologue_dir_index.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_table_prologue_dir_index.yaml new file mode 100644 index 0000000000000..1b5e28c0bb50e --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_line_table_prologue_dir_index.yaml @@ -0,0 +1,66 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-line -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_line... 
+# CHECK-NEXT: error: .debug_line[0x00000000].prologue.file_names[1].dir_idx contains an invalid index: 2 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000000000 + debug_line: + - Version: 2 + MinInstLength: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + IncludeDirs: + - /tmp + Files: + - Name: main.c + DirIdx: 2 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4096 + - Opcode: DW_LNS_advance_line + SData: 9 + Data: 4096 + - Opcode: DW_LNS_copy + Data: 4096 + - Opcode: DW_LNS_advance_pc + Data: 16 + - Opcode: DW_LNS_set_file + Data: 1 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 1 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml new file mode 100644 index 0000000000000..53317d7fc3c64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ranges.yaml @@ -0,0 +1,33 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DW_AT_ranges offset is beyond .debug_ranges bounds: 0x00001000 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_ranges + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000001000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml new file mode 100644 index 0000000000000..b998e1a754dcd --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr.yaml @@ -0,0 +1,44 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DW_FORM_ref_addr offset beyond .debug_info bounds: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001234 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml new file mode 100644 index 0000000000000..605af01311af8 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml @@ -0,0 +1,44 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info references... +# CHECK-NEXT: error: invalid DIE reference 0x00000011. 
Offset is in between DIEs: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000000011 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml new file mode 100644 index 0000000000000..6328f1f22ec71 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_rnglists.yaml @@ -0,0 +1,34 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DW_AT_ranges offset is beyond .debug_rnglists bounds: 0x00001000 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_ranges + Form: DW_FORM_sec_offset + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000001000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml new file mode 100644 index 0000000000000..22db1c1ed1a32 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_stmt_list.yaml @@ -0,0 +1,33 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DW_AT_stmt_list offset is beyond .debug_line bounds: 0x00001000 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - Value: 0x0000000000001000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml new file mode 100644 index 0000000000000..128c8ba4c4934 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_strp.yaml @@ -0,0 +1,29 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... +# CHECK-NEXT: error: DW_FORM_strp offset beyond .debug_str bounds: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000001234 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml new file mode 100644 index 0000000000000..b2e3667b106b7 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_lexical_block_ranges.yaml @@ -0,0 +1,60 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DIE address ranges are not contained in its parent's ranges: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Code: 0x00000003 + Tag: DW_TAG_lexical_block + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000003 + Values: + - Value: 0x0000000000001000 + - Value: 0x0000000000002001 + - AbbrCode: 0x00000000 + - AbbrCode: 0x00000000 diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_nested_functions.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_nested_functions.yaml new file mode 100644 index 0000000000000..275639b2e3a87 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_nested_functions.yaml @@ -0,0 +1,59 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-dwarfdump -verify %t.o | FileCheck --implicit-check-not=error: %s + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + - nested + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Attribute: DW_AT_name + Form: 
DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001000 + - Value: 0x0000000000001500 + - AbbrCode: 0x00000002 + Values: + - Value: 0x0000000000000012 + - Value: 0x0000000000001500 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000000 + - AbbrCode: 0x00000000 + - AbbrCode: 0x00000000 + diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml new file mode 100644 index 0000000000000..430e77e4fa8a1 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_function_ranges.yaml @@ -0,0 +1,54 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DIEs have overlapping address ranges + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + - foo + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000002 + Values: + - Value: 0x0000000000000012 + - Value: 0x0000000000001FFF + - Value: 0x0000000000002000 + - AbbrCode: 0x00000000 + diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml new file mode 100644 index 0000000000000..0644f65d4eef3 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_lexical_block_ranges.yaml @@ -0,0 +1,71 @@ +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -verify %t.o | FileCheck %s + +# CHECK: Verifying .debug_info Unit Header Chain... 
+# CHECK-NEXT: error: DIEs have overlapping address ranges + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +DWARF: + debug_str: + - '' + - /tmp/main.c + - main + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000002 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Code: 0x00000003 + Tag: DW_TAG_lexical_block + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - Value: 0x0000000000000001 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000D + - Value: 0x0000000000001000 + - Value: 0x0000000000002000 + - AbbrCode: 0x00000003 + Values: + - Value: 0x0000000000001100 + - Value: 0x0000000000001300 + - AbbrCode: 0x00000003 + Values: + - Value: 0x00000000000012FF + - Value: 0x0000000000001300 + - AbbrCode: 0x00000000 + - AbbrCode: 0x00000000 + diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp index 9a48066ea2eb3..9073a3fc8eba9 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp @@ -36,6 +36,7 @@ using namespace llvm; using namespace dwarf; using namespace utils; +using ::testing::HasSubstr; namespace { @@ -1851,649 +1852,6 @@ TEST(DWARFDebugInfo, TestImplicitConstAbbrevs) { EXPECT_EQ(DIEs.find(Val2)->second, AbbrevPtrVal2); } -void VerifyWarning(DWARFContext 
&DwarfContext, StringRef Error) { - SmallString<1024> Str; - raw_svector_ostream Strm(Str); - EXPECT_TRUE(DwarfContext.verify(Strm)); - EXPECT_TRUE(Str.str().contains(Error)); -} - -void VerifyError(DWARFContext &DwarfContext, StringRef Error) { - SmallString<1024> Str; - raw_svector_ostream Strm(Str); - EXPECT_FALSE(DwarfContext.verify(Strm)); - EXPECT_TRUE(Str.str().contains(Error)); -} - -void VerifySuccess(DWARFContext &DwarfContext) { - SmallString<1024> Str; - raw_svector_ostream Strm(Str); - EXPECT_TRUE(DwarfContext.verify(Strm)); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidCURef) { - // Create a single compile unit with a single function that has a DW_AT_type - // that is CU relative. The CU offset is not valid because it is larger than - // the compile unit itself. - - const char *yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_type - Form: DW_FORM_ref4 - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001234 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: DW_FORM_ref4 CU offset 0x00001234 is " - "invalid (must be less than CU size of " - "0x0000001a):"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidRefAddr) { - // Create a single compile unit with a single function that has an invalid - // DW_AT_type with an invalid .debug_info offset in its DW_FORM_ref_addr. 
- const char *yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_type - Form: DW_FORM_ref_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001234 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, - "error: DW_FORM_ref_addr offset beyond .debug_info bounds:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidRanges) { - // Create a single compile unit with a DW_AT_ranges whose section offset - // isn't valid. - const char *yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_ranges - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000001000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError( - *DwarfContext, - "error: DW_AT_ranges offset is beyond .debug_ranges bounds: 0x00001000"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidRnglists) { - // Create a single compile unit with a DW_AT_ranges whose section offset - // isn't valid. 
- const char *yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_ranges - Form: DW_FORM_sec_offset - debug_info: - - Version: 5 - UnitType: DW_UT_compile - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000001000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: DW_AT_ranges offset is beyond " - ".debug_rnglists bounds: 0x00001000"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidStmtList) { - // Create a single compile unit with a DW_AT_stmt_list whose section offset - // isn't valid. - const char *yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_stmt_list - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000001000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError( - *DwarfContext, - "error: DW_AT_stmt_list offset is beyond .debug_line bounds: 0x00001000"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidStrp) { - // Create a single compile unit with a single function that has an invalid - // DW_FORM_strp for the DW_AT_name. 
- const char *yamldata = R"( - debug_str: - - '' - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000001234 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, - "error: DW_FORM_strp offset beyond .debug_str bounds:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidRefAddrBetween) { - // Create a single compile unit with a single function that has a DW_AT_type - // with a valid .debug_info offset, but the offset is between two DIEs. - const char *yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_type - Form: DW_FORM_ref_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000000011 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(StringRef(yamldata)); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError( - *DwarfContext, - "error: invalid DIE reference 0x00000011. Offset is in between DIEs:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidLineSequence) { - // Create a single compile unit whose line table has a sequence in it where - // the address decreases. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_stmt_list - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000000000 - debug_line: - - Version: 2 - MinInstLength: 1 - DefaultIsStmt: 1 - LineBase: 251 - LineRange: 14 - OpcodeBase: 13 - StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] - IncludeDirs: - - /tmp - Files: - - Name: main.c - DirIdx: 1 - ModTime: 0 - Length: 0 - Opcodes: - - Opcode: DW_LNS_extended_op - ExtLen: 9 - SubOpcode: DW_LNE_set_address - Data: 4112 - - Opcode: DW_LNS_advance_line - SData: 9 - Data: 4112 - - Opcode: DW_LNS_copy - Data: 4112 - - Opcode: DW_LNS_advance_pc - Data: 18446744073709551600 - - Opcode: DW_LNS_extended_op - ExtLen: 1 - SubOpcode: DW_LNE_end_sequence - Data: 18446744073709551600 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: .debug_line[0x00000000] row[1] decreases " - "in address from previous row:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidLineFileIndex) { - // Create a single compile unit whose line table has a line table row with - // an invalid file index. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_stmt_list - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000000000 - debug_line: - - Version: 2 - MinInstLength: 1 - DefaultIsStmt: 1 - LineBase: 251 - LineRange: 14 - OpcodeBase: 13 - StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] - IncludeDirs: - - /tmp - Files: - - Name: main.c - DirIdx: 1 - ModTime: 0 - Length: 0 - Opcodes: - - Opcode: DW_LNS_extended_op - ExtLen: 9 - SubOpcode: DW_LNE_set_address - Data: 4096 - - Opcode: DW_LNS_advance_line - SData: 9 - Data: 4096 - - Opcode: DW_LNS_copy - Data: 4096 - - Opcode: DW_LNS_advance_pc - Data: 16 - - Opcode: DW_LNS_set_file - Data: 5 - - Opcode: DW_LNS_extended_op - ExtLen: 1 - SubOpcode: DW_LNE_end_sequence - Data: 5 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: .debug_line[0x00000000][1] has invalid " - "file index 5 (valid values are [1,1]):"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidLineTablePorlogueDirIndex) { - // Create a single compile unit whose line table has a prologue with an - // invalid dir index. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_stmt_list - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000000000 - debug_line: - - Version: 2 - MinInstLength: 1 - DefaultIsStmt: 1 - LineBase: 251 - LineRange: 14 - OpcodeBase: 13 - StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] - IncludeDirs: - - /tmp - Files: - - Name: main.c - DirIdx: 2 - ModTime: 0 - Length: 0 - Opcodes: - - Opcode: DW_LNS_extended_op - ExtLen: 9 - SubOpcode: DW_LNE_set_address - Data: 4096 - - Opcode: DW_LNS_advance_line - SData: 9 - Data: 4096 - - Opcode: DW_LNS_copy - Data: 4096 - - Opcode: DW_LNS_advance_pc - Data: 16 - - Opcode: DW_LNS_set_file - Data: 1 - - Opcode: DW_LNS_extended_op - ExtLen: 1 - SubOpcode: DW_LNE_end_sequence - Data: 1 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, - "error: .debug_line[0x00000000].prologue." - "file_names[1].dir_idx contains an invalid index: 2"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyDuplicateFileWarning) { - // Create a single compile unit whose line table has a prologue with an - // invalid dir index. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_stmt_list - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000000000 - debug_line: - - Version: 2 - MinInstLength: 1 - DefaultIsStmt: 1 - LineBase: 251 - LineRange: 14 - OpcodeBase: 13 - StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] - IncludeDirs: - - /tmp - Files: - - Name: main.c - DirIdx: 1 - ModTime: 0 - Length: 0 - - Name: main.c - DirIdx: 1 - ModTime: 0 - Length: 0 - Opcodes: - - Opcode: DW_LNS_extended_op - ExtLen: 9 - SubOpcode: DW_LNE_set_address - Data: 4096 - - Opcode: DW_LNS_advance_line - SData: 9 - Data: 4096 - - Opcode: DW_LNS_copy - Data: 4096 - - Opcode: DW_LNS_advance_pc - Data: 16 - - Opcode: DW_LNS_set_file - Data: 1 - - Opcode: DW_LNS_extended_op - ExtLen: 1 - SubOpcode: DW_LNE_end_sequence - Data: 2 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyWarning(*DwarfContext, - "warning: .debug_line[0x00000000].prologue.file_names[2] is " - "a duplicate of file_names[1]"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyCUDontShareLineTable) { - // Create a two compile units where both compile units share the same - // DW_AT_stmt_list value and verify we report the error correctly. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - /tmp/foo.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_stmt_list - Form: DW_FORM_sec_offset - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - Value: 0x0000000000000000 - - Length: 16 - Version: 4 - AbbrevTableID: 0 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000000000 - debug_line: - - Version: 2 - MinInstLength: 1 - DefaultIsStmt: 1 - LineBase: 251 - LineRange: 14 - OpcodeBase: 13 - StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] - IncludeDirs: - - /tmp - Files: - - Name: main.c - DirIdx: 1 - ModTime: 0 - Length: 0 - Opcodes: - - Opcode: DW_LNS_extended_op - ExtLen: 9 - SubOpcode: DW_LNE_set_address - Data: 4096 - - Opcode: DW_LNS_advance_line - SData: 9 - Data: 4096 - - Opcode: DW_LNS_copy - Data: 4096 - - Opcode: DW_LNS_advance_pc - Data: 256 - - Opcode: DW_LNS_extended_op - ExtLen: 1 - SubOpcode: DW_LNE_end_sequence - Data: 256 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, - "error: two compile unit DIEs, 0x0000000b and " - "0x0000001f, have the same DW_AT_stmt_list section " - "offset:"); -} - TEST(DWARFDebugInfo, TestErrorReporting) { Triple Triple("x86_64-pc-linux"); if (!isConfigurationSupported(Triple)) @@ -2528,413 +1886,6 @@ TEST(DWARFDebugInfo, TestErrorReporting) { EXPECT_TRUE(Errors == 2); } -TEST(DWARFDebugInfo, TestDwarfVerifyCURangesIncomplete) { - // Create a single compile unit with a single function. The compile - // unit has a DW_AT_ranges attribute that doesn't fully contain the - // address range of the function. 
The verification should fail due to - // the CU ranges not containing all of the address ranges of all of the - // functions. - StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000001000 - - Value: 0x0000000000001500 - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: DIE address ranges are not " - "contained in its parent's ranges:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyLexicalBlockRanges) { - // Create a single compile unit with a single function that has a lexical - // block whose address range is not contained in the function address range. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - - Code: 0x00000003 - Tag: DW_TAG_lexical_block - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000003 - Values: - - Value: 0x0000000000001000 - - Value: 0x0000000000002001 - - AbbrCode: 0x00000000 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: DIE address ranges are not " - "contained in its parent's ranges:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyOverlappingFunctionRanges) { - // Create a single compile unit with a two functions that have overlapping - // address ranges. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - - foo - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000002 - Values: - - Value: 0x0000000000000012 - - Value: 0x0000000000001FFF - - Value: 0x0000000000002000 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: DIEs have overlapping address ranges:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyOverlappingLexicalBlockRanges) { - // Create a single compile unit with a one function that has two lexical - // blocks with overlapping address ranges. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - - Code: 0x00000003 - Tag: DW_TAG_lexical_block - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000003 - Values: - - Value: 0x0000000000001100 - - Value: 0x0000000000001300 - - AbbrCode: 0x00000003 - Values: - - Value: 0x00000000000012FF - - Value: 0x0000000000001300 - - AbbrCode: 0x00000000 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: DIEs have overlapping address ranges:"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyInvalidDIERange) { - // Create a single compile unit with a single function that has an invalid - // address range where the high PC is smaller than the low PC. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001000 - - Value: 0x0000000000000900 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifyError(*DwarfContext, "error: Invalid address range"); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyElidedDoesntFail) { - // Create a single compile unit with two functions: one that has a valid range - // and one whose low and high PC are the same. When the low and high PC are - // the same, this indicates the function was dead code stripped. We want to - // ensure that verification succeeds. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - - elided - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_no - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000002 - Values: - - Value: 0x0000000000000012 - - Value: 0x0000000000002000 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifySuccess(*DwarfContext); -} - -TEST(DWARFDebugInfo, TestDwarfVerifyNestedFunctions) { - // Create a single compile unit with a nested function which is not contained - // in its parent. Although LLVM doesn't generate this, it is valid accoridng - // to the DWARF standard. 
- StringRef yamldata = R"( - debug_str: - - '' - - /tmp/main.c - - main - - nested - debug_abbrev: - - Table: - - Code: 0x00000001 - Tag: DW_TAG_compile_unit - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Code: 0x00000002 - Tag: DW_TAG_subprogram - Children: DW_CHILDREN_yes - Attributes: - - Attribute: DW_AT_name - Form: DW_FORM_strp - - Attribute: DW_AT_low_pc - Form: DW_FORM_addr - - Attribute: DW_AT_high_pc - Form: DW_FORM_addr - debug_info: - - Version: 4 - AddrSize: 8 - Entries: - - AbbrCode: 0x00000001 - Values: - - Value: 0x0000000000001000 - - Value: 0x0000000000002000 - - Value: 0x0000000000000001 - - AbbrCode: 0x00000002 - Values: - - Value: 0x000000000000000D - - Value: 0x0000000000001000 - - Value: 0x0000000000001500 - - AbbrCode: 0x00000002 - Values: - - Value: 0x0000000000000012 - - Value: 0x0000000000001500 - - Value: 0x0000000000002000 - - AbbrCode: 0x00000000 - - AbbrCode: 0x00000000 - - AbbrCode: 0x00000000 - )"; - auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata); - ASSERT_TRUE((bool)ErrOrSections); - std::unique_ptr DwarfContext = - DWARFContext::create(*ErrOrSections, 8); - VerifySuccess(*DwarfContext); -} - TEST(DWARFDebugInfo, TestDWARFDieRangeInfoContains) { DWARFVerifier::DieRangeInfo Empty; ASSERT_TRUE(Empty.contains(Empty)); From 1f2d40c47f5f8fd01d91d73a1f52044fe1c83225 Mon Sep 17 00:00:00 2001 From: liuke Date: Sat, 14 Aug 2021 10:47:27 +0800 Subject: [PATCH 027/700] [clang-tidy] fix duplicate '{}' in cppcoreguidelines-pro-type-member-init The overload of the constructor will repeatedly fix the member variables that need to be initialized. Removed the duplicate '{}'. 
``` struct A { A() {} A(int) {} int _var; // int _var{}{}; <-- wrong fix }; ``` Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D107641 --- .../cppcoreguidelines/ProTypeMemberInitCheck.cpp | 16 ++++++++++++---- .../cppcoreguidelines/ProTypeMemberInitCheck.h | 5 +++++ .../cppcoreguidelines-pro-type-member-init.cpp | 3 +-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 43812fe17a1c7..a191598415217 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -433,17 +433,25 @@ void ProTypeMemberInitCheck::checkMissingMemberInitializer( [&](const FieldDecl *F) { OrderedFields.push_back(F); }); // Collect all the fields we need to initialize, including indirect fields. + // It only includes fields that have not been fixed SmallPtrSet AllFieldsToInit; - forEachField(ClassDecl, FieldsToInit, - [&](const FieldDecl *F) { AllFieldsToInit.insert(F); }); - if (AllFieldsToInit.empty()) + forEachField(ClassDecl, FieldsToInit, [&](const FieldDecl *F) { + if (!HasRecordClassMemberSet.contains(F)) { + AllFieldsToInit.insert(F); + HasRecordClassMemberSet.insert(F); + } + }); + if (FieldsToInit.empty()) return; DiagnosticBuilder Diag = diag(Ctor ? Ctor->getBeginLoc() : ClassDecl.getLocation(), "%select{|union }0constructor %select{does not|should}0 initialize " "%select{|one of }0these fields: %1") - << IsUnion << toCommaSeparatedString(OrderedFields, AllFieldsToInit); + << IsUnion << toCommaSeparatedString(OrderedFields, FieldsToInit); + + if (AllFieldsToInit.empty()) + return; // Do not propose fixes for constructors in macros since we cannot place them // correctly. 
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h index 5b4144396eab4..af7b14ec68ad9 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H #include "../ClangTidyCheck.h" +#include "llvm/ADT/DenseSet.h" namespace clang { namespace tidy { @@ -72,6 +73,10 @@ class ProTypeMemberInitCheck : public ClangTidyCheck { // instead of brace initialization. Only effective in C++11 mode. Default is // false. bool UseAssignment; + + // Record the member variables that have been initialized to prevent repeated + // initialization. + llvm::DenseSet HasRecordClassMemberSet; }; } // namespace cppcoreguidelines diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp index 403f28baf99d4..8cab4fd755752 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp @@ -208,9 +208,8 @@ struct PositiveMultipleConstructors { PositiveMultipleConstructors(const PositiveMultipleConstructors &) {} // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: constructor does not initialize these fields: A, B - // FIXME: The fix-its here collide providing an erroneous fix int A, B; - // CHECK-FIXES: int A{}{}{}, B{}{}{}; + // CHECK-FIXES: int A{}, B{}; }; typedef struct { From c4e5425aa579d21530ef1766d7144b38a347f247 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi Date: Fri, 13 Aug 2021 16:32:02 -0600 Subject: [PATCH 028/700] [Remarks] Emit optimization remarks for atomics generating CAS loop Implements ORE in AtomicExpandPass to report atomics 
generating a compare and swap loop. Differential Revision: https://reviews.llvm.org/D106891 --- .../CodeGenCUDA/atomics-remarks-gfx90a.cu | 16 +++ .../CodeGenOpenCL/atomics-remarks-gfx90a.cl | 46 ++++++++ llvm/lib/CodeGen/AtomicExpandPass.cpp | 23 +++- .../CodeGen/AMDGPU/atomics-remarks-gfx90a.ll | 103 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 25 +++++ llvm/test/CodeGen/X86/O0-pipeline.ll | 7 +- llvm/test/CodeGen/X86/opt-pipeline.ll | 7 +- 7 files changed, 224 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu create mode 100644 clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl create mode 100644 llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu new file mode 100644 index 0000000000000..96892286fd75e --- /dev/null +++ b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +#include "Inputs/cuda.h" +#include + +// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope +// GFX90A-CAS-LABEL: _Z14atomic_add_casPf +// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc +// GFX90A-CAS: s_cbranch_execnz +__device__ float atomic_add_cas(float *p) { + return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); +} diff --git a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl new file mode 100644 index 0000000000000..2d8b68f83b9d6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: 
FileCheck %s --check-prefix=REMARK + +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +typedef enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand] +// GFX90A-CAS-LABEL: @atomic_cas +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("wavefront-one-as") monotonic +float atomic_cas(__global atomic_float *d, float a) { + float ret1 
= __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); + float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device); + float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices); + float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group); +} + + + diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 125a3be585cb5..5b5458e1058e8 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -58,6 +59,7 @@ namespace { class AtomicExpand: public FunctionPass { const TargetLowering *TLI = nullptr; + OptimizationRemarkEmitter *ORE; public: static char ID; // Pass identification, replacement for typeid @@ -69,6 +71,7 @@ namespace { bool runOnFunction(Function &F) override; private: + void getAnalysisUsage(AnalysisUsage &AU) const override; bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -165,11 +168,16 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; } +void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + bool AtomicExpand::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; + ORE = &getAnalysis().getORE(); auto &TM = TPC->getTM(); if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; @@ -570,7 +578,9 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> 
&Builder, } bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + LLVMContext &Ctx = AI->getModule()->getContext(); + TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); + switch (Kind) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: { @@ -600,6 +610,17 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { expandPartwordAtomicRMW(AI, TargetLoweringBase::AtomicExpansionKind::CmpXChg); } else { + SmallVector SSNs; + Ctx.getSyncScopeNames(SSNs); + auto MemScope = SSNs[AI->getSyncScopeID()].empty() + ? "system" + : SSNs[AI->getSyncScopeID()]; + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Passed", AI->getFunction()) + << "A compare and swap loop was generated for an atomic " + << AI->getOperationName(AI->getOperation()) << " operation at " + << MemScope << " memory scope"; + }); expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); } return true; diff --git a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll new file mode 100644 index 0000000000000..240963cfe9009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS + +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread memory scope +; GFX90A-CAS: A 
compare and swap loop was generated for an atomic fadd operation at one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread-one-as memory scope + +; GFX90A-CAS-LABEL: atomic_add_cas: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_agent: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_agent(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_workgroup: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_wavefront: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_singlethread: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) { +entry: + %ret 
= atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 73909dc918f0a..dba871eee99fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ 
-44,6 +44,11 @@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Natural Loop Information +; GCN-O0-NEXT: Lazy Branch Probability Analysis +; GCN-O0-NEXT: Lazy Block Frequency Analysis +; GCN-O0-NEXT: Optimization Remark Emitter ; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks from the CFG @@ -180,6 +185,11 @@ ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Lazy Branch Probability Analysis +; GCN-O1-NEXT: Lazy Block Frequency Analysis +; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction @@ -431,6 +441,11 @@ ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis +; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis +; GCN-O1-OPTS-NEXT: Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -715,6 +730,11 @@ ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis +; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: 
Expand Atomic instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction @@ -1001,6 +1021,11 @@ ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis +; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index bf3ae61660757..8f0275706996a 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -10,13 +10,18 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index c809433a2fff8..a480d901160fc 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -16,15 +16,20 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform 
Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store From 29e11a1aa303cf81b81fdbab74fad4f31e5018d3 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi Date: Fri, 13 Aug 2021 23:58:04 -0600 Subject: [PATCH 029/700] Revert "[Remarks] Emit optimization remarks for atomics generating CAS loop" This reverts commit c4e5425aa579d21530ef1766d7144b38a347f247. 
--- .../CodeGenCUDA/atomics-remarks-gfx90a.cu | 16 --- .../CodeGenOpenCL/atomics-remarks-gfx90a.cl | 46 -------- llvm/lib/CodeGen/AtomicExpandPass.cpp | 23 +--- .../CodeGen/AMDGPU/atomics-remarks-gfx90a.ll | 103 ------------------ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 25 ----- llvm/test/CodeGen/X86/O0-pipeline.ll | 7 +- llvm/test/CodeGen/X86/opt-pipeline.ll | 7 +- 7 files changed, 3 insertions(+), 224 deletions(-) delete mode 100644 clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu delete mode 100644 clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl delete mode 100644 llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu deleted file mode 100644 index 96892286fd75e..0000000000000 --- a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \ -// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \ -// RUN: FileCheck %s --check-prefix=GFX90A-CAS - -// REQUIRES: amdgpu-registered-target - -#include "Inputs/cuda.h" -#include - -// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope -// GFX90A-CAS-LABEL: _Z14atomic_add_casPf -// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc -// GFX90A-CAS: s_cbranch_execnz -__device__ float atomic_add_cas(float *p) { - return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); -} diff --git a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl deleted file mode 100644 index 2d8b68f83b9d6..0000000000000 --- a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ -// RUN: -Rpass=atomic-expand -S -o - 2>&1 | \ -// RUN: FileCheck %s --check-prefix=REMARK - -// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 
-triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ -// RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \ -// RUN: FileCheck %s --check-prefix=GFX90A-CAS - -// REQUIRES: amdgpu-registered-target - -typedef enum memory_order { - memory_order_relaxed = __ATOMIC_RELAXED, - memory_order_acquire = __ATOMIC_ACQUIRE, - memory_order_release = __ATOMIC_RELEASE, - memory_order_acq_rel = __ATOMIC_ACQ_REL, - memory_order_seq_cst = __ATOMIC_SEQ_CST -} memory_order; - -typedef enum memory_scope { - memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, - memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, -#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) - memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -#endif -} memory_scope; - -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand] -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand] -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand] -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand] -// GFX90A-CAS-LABEL: @atomic_cas -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("one-as") monotonic -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("wavefront-one-as") monotonic -float atomic_cas(__global atomic_float *d, float a) { - float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, 
memory_scope_work_group); - float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device); - float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices); - float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group); -} - - - diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 5b5458e1058e8..125a3be585cb5 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -59,7 +58,6 @@ namespace { class AtomicExpand: public FunctionPass { const TargetLowering *TLI = nullptr; - OptimizationRemarkEmitter *ORE; public: static char ID; // Pass identification, replacement for typeid @@ -71,7 +69,6 @@ namespace { bool runOnFunction(Function &F) override; private: - void getAnalysisUsage(AnalysisUsage &AU) const override; bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -168,16 +165,11 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; } -void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); -} - bool AtomicExpand::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; - ORE = &getAnalysis().getORE(); auto &TM = TPC->getTM(); if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; @@ -578,9 +570,7 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } bool 
AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - LLVMContext &Ctx = AI->getModule()->getContext(); - TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); - switch (Kind) { + switch (TLI->shouldExpandAtomicRMWInIR(AI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: { @@ -610,17 +600,6 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { expandPartwordAtomicRMW(AI, TargetLoweringBase::AtomicExpansionKind::CmpXChg); } else { - SmallVector SSNs; - Ctx.getSyncScopeNames(SSNs); - auto MemScope = SSNs[AI->getSyncScopeID()].empty() - ? "system" - : SSNs[AI->getSyncScopeID()]; - ORE->emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Passed", AI->getFunction()) - << "A compare and swap loop was generated for an atomic " - << AI->getOperationName(AI->getOperation()) << " operation at " - << MemScope << " memory scope"; - }); expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); } return true; diff --git a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll deleted file mode 100644 index 240963cfe9009..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ -; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS - -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread memory scope -; GFX90A-CAS: A compare and 
swap loop was generated for an atomic fadd operation at one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread-one-as memory scope - -; GFX90A-CAS-LABEL: atomic_add_cas: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_agent: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_agent(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_workgroup: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_wavefront: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_singlethread: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) { -entry: - %ret = atomicrmw 
fadd float* %p, float %q syncscope("singlethread") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index dba871eee99fd..73909dc918f0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,11 +44,6 
@@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Dominator Tree Construction -; GCN-O0-NEXT: Natural Loop Information -; GCN-O0-NEXT: Lazy Branch Probability Analysis -; GCN-O0-NEXT: Lazy Block Frequency Analysis -; GCN-O0-NEXT: Optimization Remark Emitter ; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks from the CFG @@ -185,11 +180,6 @@ ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: Lazy Branch Probability Analysis -; GCN-O1-NEXT: Lazy Block Frequency Analysis -; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction @@ -441,11 +431,6 @@ ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis -; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis -; GCN-O1-OPTS-NEXT: Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -730,11 +715,6 @@ ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: Lazy Branch Probability Analysis -; GCN-O2-NEXT: Lazy Block Frequency Analysis -; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Expand Atomic 
instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction @@ -1021,11 +1001,6 @@ ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: Lazy Branch Probability Analysis -; GCN-O3-NEXT: Lazy Block Frequency Analysis -; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 8f0275706996a..bf3ae61660757 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -10,18 +10,13 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index a480d901160fc..c809433a2fff8 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -16,20 +16,15 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; 
CHECK-NEXT: Profile summary info ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store From 632135acae2074f3a3578a719323b776b6d51716 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sat, 14 Aug 2021 17:49:31 +1000 Subject: [PATCH 030/700] [JITLink][x86-64] Rename BranchPCRel32ToPtrJumpStub(Relaxable -> Bypassable). ELF allows for branch optimizations other than bypass, so rename this edge kind to avoid any confusion. --- llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h | 9 +++++---- llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp | 9 ++++----- llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp | 6 +++--- llvm/lib/ExecutionEngine/JITLink/x86_64.cpp | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 9714d6285c24e..b3d8022d063b0 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -132,7 +132,7 @@ enum EdgeKind_x86_64 : Edge::Kind { /// This edge kind has the same fixup expression as BranchPCRel32, but further /// identifies the call/branch as being to a pointer jump stub. 
For edges of /// this kind the jump stub should not be bypassed (use - /// BranchPCRel32ToPtrJumpStubRelaxable for that), but the pointer location + /// BranchPCRel32ToPtrJumpStubBypassable for that), but the pointer location /// target may be recorded to allow manipulation at runtime. /// /// Fixup expression: @@ -148,7 +148,8 @@ enum EdgeKind_x86_64 : Edge::Kind { /// /// The edge kind has the same fixup expression as BranchPCRel32ToPtrJumpStub, /// but identifies the call/branch as being to a pointer jump stub that may be - /// bypassed if the ultimate target is within range of the fixup location. + /// bypassed with a direct jump to the ultimate target if the ultimate target + /// is within range of the fixup location. /// /// Fixup expression: /// Fixup <- Target - Fixup + Addend - 4: int32 @@ -157,7 +158,7 @@ enum EdgeKind_x86_64 : Edge::Kind { /// - The result of the fixup expression must fit into an int32, otherwise /// an out-of-range error will be returned. /// - BranchPCRel32ToPtrJumpStubRelaxable, + BranchPCRel32ToPtrJumpStubBypassable, /// A GOT entry getter/constructor, transformed to Delta32 pointing at the GOT /// entry for the original target. @@ -338,7 +339,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, case BranchPCRel32: case BranchPCRel32ToPtrJumpStub: - case BranchPCRel32ToPtrJumpStubRelaxable: + case BranchPCRel32ToPtrJumpStubBypassable: case PCRel32GOTLoadRelaxable: case PCRel32TLVPLoadRelaxable: { int64_t Value = diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 3d81bd20f855e..5fc6a801b256e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -107,10 +107,9 @@ class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64 void fixPLTEdge(Edge &E, Symbol &Stub) { assert(E.getKind() == x86_64::BranchPCRel32 && "Not a Branch32 edge?"); - // Set the edge kind to Branch32ToStub. 
We will use this to check for stub - // optimization opportunities in the optimize ELF_x86_64_GOTAndStubs pass - // below. - E.setKind(x86_64::BranchPCRel32ToPtrJumpStubRelaxable); + // Set the edge kind to Branch32ToPtrJumpStubRelaxable to enable it to be + // optimized when the target is in-range. + E.setKind(x86_64::BranchPCRel32ToPtrJumpStubBypassable); E.setTarget(Stub); } @@ -191,7 +190,7 @@ static Error optimizeELF_x86_64_GOTAndStubs(LinkGraph &G) { dbgs() << "\n"; }); } - } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubRelaxable) { + } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) { auto &StubBlock = E.getTarget().getBlock(); assert( StubBlock.getSize() == diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 2acd0150c7261..349b299d6a647 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -466,10 +466,10 @@ class PerGraphGOTAndPLTStubsBuilder_MachO_x86_64 assert(E.getAddend() == 0 && "BranchPCRel32 edge has unexpected addend value"); - // Set the edge kind to BranchPCRel32ToPtrJumpStubRelaxable. We will use + // Set the edge kind to BranchPCRel32ToPtrJumpStubBypassable. We will use // this to check for stub optimization opportunities in the // optimizeMachO_x86_64_GOTAndStubs pass below. 
- E.setKind(x86_64::BranchPCRel32ToPtrJumpStubRelaxable); + E.setKind(x86_64::BranchPCRel32ToPtrJumpStubBypassable); E.setTarget(Stub); } @@ -535,7 +535,7 @@ static Error optimizeMachO_x86_64_GOTAndStubs(LinkGraph &G) { dbgs() << "\n"; }); } - } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubRelaxable) { + } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) { auto &StubBlock = E.getTarget().getBlock(); assert(StubBlock.getSize() == sizeof(x86_64::PointerJumpStubContent) && "Stub block should be stub sized"); diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index 7d28007b49945..a9ec2775c7806 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -38,8 +38,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "BranchPCRel32"; case BranchPCRel32ToPtrJumpStub: return "BranchPCRel32ToPtrJumpStub"; - case BranchPCRel32ToPtrJumpStubRelaxable: - return "BranchPCRel32ToPtrJumpStubRelaxable"; + case BranchPCRel32ToPtrJumpStubBypassable: + return "BranchPCRel32ToPtrJumpStubBypassable"; case RequestGOTAndTransformToDelta32: return "RequestGOTAndTransformToDelta32"; case RequestGOTAndTransformToDelta64: From 27ea3f16072a911aa8cef04ef3de5b574188c74a Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sat, 14 Aug 2021 18:27:16 +1000 Subject: [PATCH 031/700] [JITLink][x86-64] Rename *Relaxable edges to *REXRelaxable. The existing relaxable edges all assume a REX prefix. ELF includes non-REX relaxations, so rename these edges to make room for the new kinds. 
--- .../llvm/ExecutionEngine/JITLink/x86_64.h | 41 ++++++++++--------- .../ExecutionEngine/JITLink/ELF_x86_64.cpp | 12 +++--- .../ExecutionEngine/JITLink/MachO_x86_64.cpp | 15 ++++--- llvm/lib/ExecutionEngine/JITLink/x86_64.cpp | 16 ++++---- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 6 +-- 5 files changed, 47 insertions(+), 43 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index b3d8022d063b0..1533f24fe7760 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -221,7 +221,7 @@ enum EdgeKind_x86_64 : Edge::Kind { /// phase will result in an assert/unreachable during the fixup phase RequestGOTAndTransformToDelta64FromGOT, - /// A PC-relative reference to a GOT entry, relaxable if GOT entry target + /// A PC-relative REX load of a GOT entry, relaxable if GOT entry target /// is in-range of the fixup. /// /// If the GOT entry target is in-range of the fixup then the load from the @@ -234,17 +234,18 @@ enum EdgeKind_x86_64 : Edge::Kind { /// - The result of the fixup expression must fit into an int32, otherwise /// an out-of-range error will be returned. /// - PCRel32GOTLoadRelaxable, + PCRel32GOTLoadREXRelaxable, - /// A GOT entry getter/constructor, transformed to PCRel32ToGOTLoadRelaxable - /// pointing at the GOT entry for the original target. + /// A GOT entry getter/constructor, transformed to + /// PCRel32ToGOTLoadREXRelaxable pointing at the GOT entry for the original + /// target. /// - /// Indicates that this edge should be transformed into a - /// PC32ToGOTLoadRelaxable targeting the GOT entry for the edge's current - /// target, maintaining the same addend. A GOT entry for the target should be - /// created if one does not already exist. 
+ /// Indicates that this edge should be lowered to a PC32ToGOTLoadREXRelaxable + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does not + /// already exist. /// - /// Edges of this kind are usually handled by a GOT builder pass inserted by + /// Edges of this kind are usually lowered by a GOT builder pass inserted by /// default. /// /// Fixup expression: @@ -254,12 +255,12 @@ enum EdgeKind_x86_64 : Edge::Kind { /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup /// phase will result in an assert/unreachable during the fixup phase. /// - RequestGOTAndTransformToPCRel32GOTLoadRelaxable, + RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable, - /// A PC-relative reference to a Thread Local Variable Pointer (TLVP) entry, + /// A PC-relative REX load of a Thread Local Variable Pointer (TLVP) entry, /// relaxable if the TLVP entry target is in-range of the fixup. /// - /// If the TLVP entry target is in-range of the fixup then the load frmo the + /// If the TLVP entry target is in-range of the fixup then the load from the /// TLVP may be replaced with a direct memory address calculation. /// /// The target of this edge must be a thread local variable entry of the form @@ -276,15 +277,15 @@ enum EdgeKind_x86_64 : Edge::Kind { /// - The target must be either external, or a TLV entry of the required /// form, otherwise a malformed TLV entry error will be returned. /// - PCRel32TLVPLoadRelaxable, + PCRel32TLVPLoadREXRelaxable, /// A TLVP entry getter/constructor, transformed to - /// Delta32ToTLVPLoadRelaxable. + /// Delta32ToTLVPLoadREXRelaxable. /// /// Indicates that this edge should be transformed into a - /// Delta32ToTLVPLoadRelaxable targeting the TLVP entry for the edge's current - /// target. A TLVP entry for the target should be created if one does not - /// already exist. 
+ /// Delta32ToTLVPLoadREXRelaxable targeting the TLVP entry for the edge's + /// current target. A TLVP entry for the target should be created if one does + /// not already exist. /// /// Fixup expression: /// NONE @@ -293,7 +294,7 @@ enum EdgeKind_x86_64 : Edge::Kind { /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup /// phase will result in an assert/unreachable during the fixup phase. /// - RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable + RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable }; /// Returns a string name for the given x86-64 edge. For debugging purposes @@ -340,8 +341,8 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, case BranchPCRel32: case BranchPCRel32ToPtrJumpStub: case BranchPCRel32ToPtrJumpStubBypassable: - case PCRel32GOTLoadRelaxable: - case PCRel32TLVPLoadRelaxable: { + case PCRel32GOTLoadREXRelaxable: + case PCRel32TLVPLoadREXRelaxable: { int64_t Value = E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); if (LLVM_LIKELY(isInRangeForImmS32(Value))) diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 5fc6a801b256e..c47d6305fe06c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -53,7 +53,7 @@ class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64 return E.getKind() == x86_64::RequestGOTAndTransformToDelta32 || E.getKind() == x86_64::RequestGOTAndTransformToDelta64 || E.getKind() == - x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable || + x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable || E.getKind() == x86_64::RequestGOTAndTransformToDelta64FromGOT; } @@ -71,8 +71,8 @@ class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64 // optimizeMachO_x86_64_GOTAndStubs pass below. // If it's a GOT64 leave it as is. 
switch (E.getKind()) { - case x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable: - E.setKind(x86_64::PCRel32GOTLoadRelaxable); + case x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable: + E.setKind(x86_64::PCRel32GOTLoadREXRelaxable); break; case x86_64::RequestGOTAndTransformToDelta64: E.setKind(x86_64::Delta64); @@ -107,7 +107,7 @@ class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64 void fixPLTEdge(Edge &E, Symbol &Stub) { assert(E.getKind() == x86_64::BranchPCRel32 && "Not a Branch32 edge?"); - // Set the edge kind to Branch32ToPtrJumpStubRelaxable to enable it to be + // Set the edge kind to Branch32ToPtrJumpStubBypassable to enable it to be // optimized when the target is in-range. E.setKind(x86_64::BranchPCRel32ToPtrJumpStubBypassable); E.setTarget(Stub); @@ -154,7 +154,7 @@ static Error optimizeELF_x86_64_GOTAndStubs(LinkGraph &G) { for (auto *B : G.blocks()) for (auto &E : B->edges()) - if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable) { + if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { // Replace GOT load with LEA only for MOVQ instructions. 
constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b}; if (E.getOffset() < 3 || @@ -371,7 +371,7 @@ class ELFLinkGraphBuilder_x86_64 : public ELFLinkGraphBuilder { Kind = x86_64::Pointer64; break; case PCRel32GOTLoad: { - Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; + Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable; Addend = 0; break; } diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 349b299d6a647..4905691b03b53 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -300,7 +300,7 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { else return TargetSymbolOrErr.takeError(); Addend = *(const little32_t *)FixupContent; - Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; + Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable; if (FixupOffset < 3) return make_error("GOTLD at invalid offset " + formatv("{0}", FixupOffset)); @@ -319,7 +319,10 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { else return TargetSymbolOrErr.takeError(); Addend = *(const little32_t *)FixupContent; - Kind = x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable; + Kind = x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable; + if (FixupOffset < 3) + return make_error("TLV at invalid offset " + + formatv("{0}", FixupOffset)); break; case MachOPointer32: if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) @@ -429,7 +432,7 @@ class PerGraphGOTAndPLTStubsBuilder_MachO_x86_64 bool isGOTEdgeToFix(Edge &E) const { return E.getKind() == x86_64::RequestGOTAndTransformToDelta32 || E.getKind() == - x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; + x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable; } Symbol &createGOTEntry(Symbol &Target) { @@ -442,8 +445,8 @@ class PerGraphGOTAndPLTStubsBuilder_MachO_x86_64 case 
x86_64::RequestGOTAndTransformToDelta32: E.setKind(x86_64::Delta32); break; - case x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable: - E.setKind(x86_64::PCRel32GOTLoadRelaxable); + case x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable: + E.setKind(x86_64::PCRel32GOTLoadREXRelaxable); break; default: llvm_unreachable("Not a GOT transform edge"); @@ -500,7 +503,7 @@ static Error optimizeMachO_x86_64_GOTAndStubs(LinkGraph &G) { for (auto *B : G.blocks()) for (auto &E : B->edges()) - if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable) { + if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { assert(E.getOffset() >= 3 && "GOT edge occurs too early in block"); // Optimize GOT references. diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index a9ec2775c7806..ad95fe483ba8c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -46,14 +46,14 @@ const char *getEdgeKindName(Edge::Kind K) { return "RequestGOTAndTransformToDelta64"; case RequestGOTAndTransformToDelta64FromGOT: return "RequestGOTAndTransformToDelta64FromGOT"; - case PCRel32GOTLoadRelaxable: - return "PCRel32GOTLoadRelaxable"; - case RequestGOTAndTransformToPCRel32GOTLoadRelaxable: - return "RequestGOTAndTransformToPCRel32GOTLoadRelaxable"; - case PCRel32TLVPLoadRelaxable: - return "PCRel32TLVPLoadRelaxable"; - case RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable: - return "RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable"; + case PCRel32GOTLoadREXRelaxable: + return "PCRel32GOTLoadREXRelaxable"; + case RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable: + return "RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable"; + case PCRel32TLVPLoadREXRelaxable: + return "PCRel32TLVPLoadREXRelaxable"; + case RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable: + return "RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable"; default: return getGenericEdgeKindName(static_cast(K)); } diff --git 
a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 66ef835dc34da..23086d3bfb32b 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -950,9 +950,9 @@ Error MachOPlatform::MachOPlatformPlugin::fixTLVSectionsAndEdges( for (auto *B : G.blocks()) for (auto &E : B->edges()) if (E.getKind() == - jitlink::x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadRelaxable) - E.setKind( - jitlink::x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable); + jitlink::x86_64::RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable) + E.setKind(jitlink::x86_64:: + RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable); return Error::success(); } From 107401002eab206b3a255a2d9bb8c2c3fb54cae5 Mon Sep 17 00:00:00 2001 From: Dawid Jurczak Date: Tue, 10 Aug 2021 12:56:44 +0200 Subject: [PATCH 032/700] [NFC][DSE] Clean up KnownNoReads and MemorySSAScanLimit in DSE Another simple cleanups set in DSE. CheckCache is removed since 1f1145006b32 and in consequence KnownNoReads is useless. Also update description of MemorySSAScanLimit which default value is 150 instead 100. 
Differential Revision: https://reviews.llvm.org/D107812 --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index b315ecf63c96b..1bc4a3886dec1 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -124,7 +124,7 @@ EnablePartialStoreMerging("enable-dse-partial-store-merging", static cl::opt MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden, cl::desc("The number of memory instructions to scan for " - "dead store elimination (default = 100)")); + "dead store elimination (default = 150)")); static cl::opt MemorySSAUpwardsStepLimit( "dse-memoryssa-walklimit", cl::init(90), cl::Hidden, cl::desc("The maximum number of steps while walking upwards to find " @@ -1501,11 +1501,6 @@ struct DSEState { }; PushMemUses(EarlierAccess); - // Optimistically collect all accesses for reads. If we do not find any - // read clobbers, add them to the cache. - SmallPtrSet KnownNoReads; - if (!EarlierMemInst->mayReadFromMemory()) - KnownNoReads.insert(EarlierAccess); // Check if EarlierDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { MemoryAccess *UseAccess = WorkList[I]; @@ -1518,7 +1513,6 @@ struct DSEState { } --ScanLimit; NumDomMemDefChecks++; - KnownNoReads.insert(UseAccess); if (isa(UseAccess)) { if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) { From 0391165134fc2d19fd13170d87724c3b7bd7366e Mon Sep 17 00:00:00 2001 From: mydeveloperday Date: Sat, 14 Aug 2021 10:29:07 +0100 Subject: [PATCH 033/700] [clang-format] NFC update the ClangFormatStyleOption.rst following previous change clang/docs/tool/dump_format_style.py was not run as part of {D99840} Bring ClangFormatStyleOptions.rst back in line. 
Reviewed By: HazardyKnusperkeks Differential Revision: https://reviews.llvm.org/D107958 --- clang/docs/ClangFormatStyleOptions.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 96d89db7a5ccf..5bd5ae1f080e4 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -741,8 +741,7 @@ the configuration (without a prefix: ``Auto``). enum { A, B } myEnum; false: - enum - { + enum { A, B } myEnum; From fe866327c1f98a327767e80290dd08cedeadbfd6 Mon Sep 17 00:00:00 2001 From: mydeveloperday Date: Sat, 14 Aug 2021 12:05:21 +0100 Subject: [PATCH 034/700] [clang-tidy] [PR50069] readability-braces-around-statements doesn't work well with [[likely]] [[unlikely]] https://bugs.llvm.org/show_bug.cgi?id=50069 When clang-tidy sees: ``` if (true) [[unlikely]] { ... } ``` It thinks the braces are missing and add them again. ``` if (true) { [[unlikely]] { ... } } ``` This revision aims to prevent that incorrect code generation Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D105479 --- .../BracesAroundStatementsCheck.cpp | 4 ++++ ...ty-braces-around-statements-attributes.cpp | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp diff --git a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp index fe25f7a7ccbcc..7dc519c152828 100644 --- a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp @@ -186,6 +186,10 @@ BracesAroundStatementsCheck::findRParenLoc(const IfOrWhileStmt *S, bool BracesAroundStatementsCheck::checkStmt( const MatchFinder::MatchResult &Result, const Stmt *S, SourceLocation InitialLoc, 
SourceLocation EndLocHint) { + + while (const auto *AS = dyn_cast(S)) + S = AS->getSubStmt(); + // 1) If there's a corresponding "else" or "while", the check inserts "} " // right before that token. // 2) If there's a multi-line block comment starting on the same line after diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp new file mode 100644 index 0000000000000..e799614a1f7b0 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp @@ -0,0 +1,24 @@ +// RUN: %check_clang_tidy -std=c++20-or-later %s readability-braces-around-statements %t + +void test(bool b) { + if (b) { + return; + } + if (b) [[likely]] { + // CHECK-FIXES-NOT: if (b) { {{[[][[]}}likely{{[]][]]}} { + return; + } + if (b) [[unlikely]] { + // CHECK-FIXES-NOT: if (b) { {{[[][[]}}unlikely{{[]][]]}} { + return; + } + + if (b) [[likely]] + // CHECK-FIXES: if (b) {{[[][[]}}likely{{[]][]]}} { + return; + // CHECK-FIXES: } + if (b) [[unlikely]] + // CHECK-FIXES: if (b) {{[[][[]}}unlikely{{[]][]]}} { + return; + // CHECK-FIXES: } +} From 012173680f368bff9b4e3db21e1381360422cdc6 Mon Sep 17 00:00:00 2001 From: eopXD Date: Sat, 14 Aug 2021 15:58:05 +0800 Subject: [PATCH 035/700] [LoopIdiom] let the pass deal with runtime memset size The current LIR does not deal with runtime-determined memset-size. This patch utilizes SCEV and check if the PointerStrideSCEV and the MemsetSizeSCEV are equal. Before comparison the pass would try to fold the expression that is already protected by the loop guard. Testcase file `memset-runtime.ll`, `memset-runtime-debug.ll` added. This patch deals with proper loop-idiom. Proceeding patch wants to deal with SCEV-s that are inequal after folding with the loop guards. 
Reviewed By: lebedev.ri, Whitney Differential Revision: https://reviews.llvm.org/D107353 --- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 96 +++++-- .../LoopIdiom/memset-runtime-debug.ll | 270 ++++++++++++++++++ .../Transforms/LoopIdiom/memset-runtime.ll | 110 +++++++ 3 files changed, 445 insertions(+), 31 deletions(-) create mode 100644 llvm/test/Transforms/LoopIdiom/memset-runtime-debug.ll create mode 100644 llvm/test/Transforms/LoopIdiom/memset-runtime.ll diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index f1dcb10b01bf1..6cf8f5a0b0d96 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -896,8 +896,8 @@ bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI, /// processLoopMemSet - See if this memset can be promoted to a large memset. bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { - // We can only handle non-volatile memsets with a constant size. - if (MSI->isVolatile() || !isa(MSI->getLength())) + // We can only handle non-volatile memsets. + if (MSI->isVolatile()) return false; // If we're not allowed to hack on memset, we fail. @@ -910,23 +910,72 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, // loop, which indicates a strided store. If we have something else, it's a // random store we can't handle. const SCEVAddRecExpr *Ev = dyn_cast(SE->getSCEV(Pointer)); - if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) + if (!Ev || Ev->getLoop() != CurLoop) return false; - - // Reject memsets that are so large that they overflow an unsigned. - uint64_t SizeInBytes = cast(MSI->getLength())->getZExtValue(); - if ((SizeInBytes >> 32) != 0) + if (!Ev->isAffine()) { + LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n"); return false; + } - // Check to see if the stride matches the size of the memset. If so, then we - // know that every byte is touched in the loop. 
- const SCEVConstant *ConstStride = dyn_cast(Ev->getOperand(1)); - if (!ConstStride) + const SCEV *PointerStrideSCEV = Ev->getOperand(1); + const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength()); + if (!PointerStrideSCEV || !MemsetSizeSCEV) return false; - APInt Stride = ConstStride->getAPInt(); - if (SizeInBytes != Stride && SizeInBytes != -Stride) - return false; + bool IsNegStride = false; + const bool IsConstantSize = isa(MSI->getLength()); + + if (IsConstantSize) { + // Memset size is constant. + // Check if the pointer stride matches the memset size. If so, then + // we know that every byte is touched in the loop. + LLVM_DEBUG(dbgs() << " memset size is constant\n"); + uint64_t SizeInBytes = cast(MSI->getLength())->getZExtValue(); + const SCEVConstant *ConstStride = dyn_cast(Ev->getOperand(1)); + if (!ConstStride) + return false; + + APInt Stride = ConstStride->getAPInt(); + if (SizeInBytes != Stride && SizeInBytes != -Stride) + return false; + + IsNegStride = SizeInBytes == -Stride; + } else { + // Memset size is non-constant. + // Check if the pointer stride matches the memset size. + // To be conservative, the pass would not promote pointers that aren't in + // address space zero. Also, the pass only handles memset length and stride + // that are invariant for the top level loop. + LLVM_DEBUG(dbgs() << " memset size is non-constant\n"); + if (Pointer->getType()->getPointerAddressSpace() != 0) { + LLVM_DEBUG(dbgs() << " pointer is not in address space zero, " + << "abort\n"); + return false; + } + if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) { + LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, " + << "abort\n"); + return false; + } + + // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV + IsNegStride = PointerStrideSCEV->isNonConstantNegative(); + const SCEV *PositiveStrideSCEV = + IsNegStride ? 
SE->getNegativeSCEV(PointerStrideSCEV) + : PointerStrideSCEV; + LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n" + << " PositiveStrideSCEV: " << *PositiveStrideSCEV + << "\n"); + + if (PositiveStrideSCEV != MemsetSizeSCEV) { + // TODO: folding can be done to the SCEVs + // The folding is to fold expressions that is covered by the loop guard + // at loop entry. After the folding, compare again and proceed + // optimization if equal. + LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); + return false; + } + } // Verify that the memset value is loop invariant. If not, we can't promote // the memset. @@ -936,7 +985,6 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet MSIs; MSIs.insert(MSI); - bool IsNegStride = SizeInBytes == -Stride; return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()), MaybeAlign(MSI->getDestAlignment()), SplatValue, MSI, MSIs, Ev, BECount, @@ -1028,20 +1076,6 @@ static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr, /// /// This also maps the SCEV into the provided type and tries to handle the /// computation in a way that will fold cleanly. -static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, - unsigned StoreSize, Loop *CurLoop, - const DataLayout *DL, ScalarEvolution *SE) { - const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE); - - // And scale it based on the store size. - if (StoreSize != 1) { - return SE->getMulExpr(TripCountSCEV, SE->getConstant(IntPtr, StoreSize), - SCEV::FlagNUW); - } - return TripCountSCEV; -} - -/// getNumBytes that takes StoreSize as a SCEV static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, const SCEV *StoreSizeSCEV, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE) { @@ -1342,8 +1376,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( // Okay, everything is safe, we can transform this! 
- const SCEV *NumBytesS = - getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); + const SCEV *NumBytesS = getNumBytes( + BECount, IntIdxTy, SE->getConstant(IntIdxTy, StoreSize), CurLoop, DL, SE); Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); diff --git a/llvm/test/Transforms/LoopIdiom/memset-runtime-debug.ll b/llvm/test/Transforms/LoopIdiom/memset-runtime-debug.ll new file mode 100644 index 0000000000000..8ee554eb6d25e --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/memset-runtime-debug.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts +; RUN: opt < %s -S -debug -passes=loop-idiom 2>&1 | FileCheck %s +; The C code to generate this testcase: +; void test(int *ar, int n, int m) +; { +; long i; +; for (i=0; i +; CHECK-NEXT: PositiveStrideSCEV: (4 + (4 * (sext i32 %m to i64))) +; CHECK-NEXT: SCEV don't match, abort +; CHECK: loop-idiom Scanning: F[NonZeroAddressSpace] Countable Loop %for.cond1.preheader +; CHECK-NEXT: memset size is non-constant +; CHECK-NEXT: pointer is not in address space zero, abort +; CHECK: loop-idiom Scanning: F[NonAffinePointer] Countable Loop %for.body +; CHECK-NEXT: Pointer is not affine, abort + +define void @MemsetSize_LoopVariant(i32* %ar, i32 %n, i32 %m) { +; CHECK-LABEL: @MemsetSize_LoopVariant( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[N:%.*]] to i64 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 0, [[CONV]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[M:%.*]] to i64 +; CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[M]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[CONV2]], 4 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_02]], [[CONV1]] +; 
CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[AR:%.*]], i64 [[MUL]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ADD_PTR]] to i8* +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[I_02]], [[MUL3]] +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 [[ADD]], i1 false) +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INC]], [[CONV]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %conv = sext i32 %n to i64 + %cmp1 = icmp slt i64 0, %conv + br i1 %cmp1, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %conv1 = sext i32 %m to i64 + %conv2 = sext i32 %m to i64 + %mul3 = mul i64 %conv2, 4 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.inc + %i.02 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + %mul = mul nsw i64 %i.02, %conv1 + %add.ptr = getelementptr inbounds i32, i32* %ar, i64 %mul + %0 = bitcast i32* %add.ptr to i8* + %add = add nsw i64 %i.02, %mul3 + call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 0, i64 %add, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nuw nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, %conv + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.inc + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} +; void test(int *ar, int n, int m) +; { +; long i; +; for (i=0; i=0; i--) { +; int *arr = ar + i * m; +; memset(arr, 0, m * sizeof(int)); +; } +; } +define void @For_NegativeStride(i32* %ar, i32 %n, i32 %m) { +; CHECK-LABEL: @For_NegativeStride( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AR1:%.*]] = bitcast i32* [[AR:%.*]] to i8* +; CHECK-NEXT: [[SUB:%.*]] = sub nsw 
i32 [[N:%.*]], 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sge i64 [[CONV]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[M:%.*]] to i64 +; CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[M]] to i64 +; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[CONV2]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[CONV]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[CONV1]], [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[AR1]], i8 0, i64 [[TMP2]], i1 false) +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %sub = sub nsw i32 %n, 1 + %conv = sext i32 %sub to i64 + %cmp1 = icmp sge i64 %conv, 0 + br i1 %cmp1, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %conv1 = sext i32 %m to i64 + %conv2 = sext i32 %m to i64 + %mul3 = mul i64 %conv2, 4 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.inc + %i.02 = phi i64 [ %conv, %for.body.lr.ph ], [ %dec, %for.inc ] + %mul = mul nsw i64 %i.02, %conv1 + %add.ptr = getelementptr inbounds i32, i32* %ar, i64 %mul + %0 = bitcast i32* %add.ptr to i8* + call void @llvm.memset.p0i8.i64(i8* align 4 %0, i8 0, i64 %mul3, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body + %dec = add nsw i64 %i.02, -1 + %cmp = icmp sge i64 %dec, 0 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.inc + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) From f6928cf45516503deb48f8175a982becc579dc8c Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 12 Aug 2021 06:12:25 -0700 Subject: [PATCH 036/700] [clang-format] Distinguish K&R C function definition and attribute This is a follow-up to 
https://reviews.llvm.org/D107950 which missed user-defined types in K&R C. Differential Revision: https://reviews.llvm.org/D107961 --- clang/lib/Format/UnwrappedLineParser.cpp | 29 +++++++++++++++++------- clang/unittests/Format/FormatTest.cpp | 14 ++++++++++++ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 0c4cacab50506..fae77ab48612e 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -14,7 +14,6 @@ #include "UnwrappedLineParser.h" #include "FormatToken.h" -#include "clang/Basic/TokenKinds.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -995,6 +994,13 @@ static bool isJSDeclOrStmt(const AdditionalKeywords &Keywords, Keywords.kw_import, tok::kw_export); } +// Checks whether a token is a type in K&R C (aka C78). +static bool isC78Type(const FormatToken &Tok) { + return Tok.isOneOf(tok::kw_char, tok::kw_short, tok::kw_int, tok::kw_long, + tok::kw_unsigned, tok::kw_float, tok::kw_double, + tok::identifier); +} + // This function checks whether a token starts the first parameter declaration // in a K&R C (aka C78) function definition, e.g.: // int f(a, b) @@ -1006,9 +1012,8 @@ static bool isC78ParameterDecl(const FormatToken *Tok) { if (!Tok) return false; - if (!Tok->isOneOf(tok::kw_int, tok::kw_char, tok::kw_float, tok::kw_double, - tok::kw_struct, tok::kw_union, tok::kw_long, tok::kw_short, - tok::kw_unsigned, tok::kw_register)) + if (!isC78Type(*Tok) && + !Tok->isOneOf(tok::kw_register, tok::kw_struct, tok::kw_union)) return false; Tok = Tok->Previous; @@ -1369,7 +1374,7 @@ void UnwrappedLineParser::parseStructuralElement(bool IsTopLevel) { case tok::r_brace: addUnwrappedLine(); return; - case tok::l_paren: + case tok::l_paren: { parseParens(); // Break the unwrapped line if a K&R C function definition has a parameter // declaration. 
@@ -1377,14 +1382,22 @@ void UnwrappedLineParser::parseStructuralElement(bool IsTopLevel) { break; if (!Previous || Previous->isNot(tok::identifier)) break; - if (Previous->Previous && Previous->Previous->is(tok::at)) + const FormatToken *PrevPrev = Previous->Previous; + if (!PrevPrev || (!isC78Type(*PrevPrev) && PrevPrev->isNot(tok::star))) break; - if (!Line->Tokens.begin()->Tok->is(tok::kw_typedef) && - isC78ParameterDecl(FormatTok)) { + const unsigned Position = Tokens->getPosition() + 1; + if (Position == AllTokens.size()) + break; + assert(Position < AllTokens.size()); + const FormatToken *Next = AllTokens[Position]; + if (Next && Next->isOneOf(tok::l_paren, tok::semi)) + break; + if (isC78ParameterDecl(FormatTok)) { addUnwrappedLine(); return; } break; + } case tok::kw_operator: nextToken(); if (FormatTok->isBinaryOperator()) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 1283aa67b3370..383c2cf9e6c45 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -8247,6 +8247,20 @@ TEST_F(FormatTest, ReturnTypeBreakingStyle) { " return a + b < c;\n" "};", Style); + verifyFormat("byte *\n" // Break here. + "f(a)\n" // Break here. + "byte a[];\n" + "{\n" + " return a;\n" + "}", + Style); + verifyFormat("bool f(int a, int) override;\n" + "Bar g(int a, Bar) final;\n" + "Bar h(a, Bar) final;", + Style); + verifyFormat("int\n" + "f(a)", + Style); // The return breaking style doesn't affect: // * function and object definitions with attribute-like macros From 915cc6925980b090564d0db741265dc86163dae1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 14 Aug 2021 08:49:40 -0700 Subject: [PATCH 037/700] [Aarch64] Remove redundant c_str (NFC) Identified with readability-redundant-string-cstr. 
--- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index cecb44e9dbff6..2f93e81b070ca 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3362,7 +3362,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, else if (!IC->haveFeatures(getSTI().getFeatureBits())) { std::string Str("IC " + std::string(IC->Name) + " requires: "); setRequiredFeatureString(IC->getRequiredFeatures(), Str); - return TokError(Str.c_str()); + return TokError(Str); } createSysAlias(IC->Encoding, Operands, S); } else if (Mnemonic == "dc") { @@ -3372,7 +3372,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, else if (!DC->haveFeatures(getSTI().getFeatureBits())) { std::string Str("DC " + std::string(DC->Name) + " requires: "); setRequiredFeatureString(DC->getRequiredFeatures(), Str); - return TokError(Str.c_str()); + return TokError(Str); } createSysAlias(DC->Encoding, Operands, S); } else if (Mnemonic == "at") { @@ -3382,7 +3382,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, else if (!AT->haveFeatures(getSTI().getFeatureBits())) { std::string Str("AT " + std::string(AT->Name) + " requires: "); setRequiredFeatureString(AT->getRequiredFeatures(), Str); - return TokError(Str.c_str()); + return TokError(Str); } createSysAlias(AT->Encoding, Operands, S); } else if (Mnemonic == "tlbi") { @@ -3392,7 +3392,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) { std::string Str("TLBI " + std::string(TLBI->Name) + " requires: "); setRequiredFeatureString(TLBI->getRequiredFeatures(), Str); - return TokError(Str.c_str()); + return TokError(Str); } createSysAlias(TLBI->Encoding, Operands, S); } 
else if (Mnemonic == "cfp" || Mnemonic == "dvp" || Mnemonic == "cpp") { @@ -3403,7 +3403,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, std::string Str( Mnemonic.upper() + std::string(PRCTX->Name) + " requires: "); setRequiredFeatureString(PRCTX->getRequiredFeatures(), Str); - return TokError(Str.c_str()); + return TokError(Str); } uint16_t PRCTX_Op2 = Mnemonic == "cfp" ? 4 : From 4ec32375bcbd65685c24e168c7cd1d2948f9a3e8 Mon Sep 17 00:00:00 2001 From: luxufan <932494295@qq.com> Date: Sun, 15 Aug 2021 00:30:42 +0800 Subject: [PATCH 038/700] [JITLink] Unify x86-64 MachO and ELF 's optimize GOT/Stub function This patch unify optimizeELF_x86_64_GOTAndStubs and optimizeMachO_x86_64_GOTAndStubs into a pure optimize_x86_64_GOTAndStubs Reviewed By: lhames Differential Revision: https://reviews.llvm.org/D108025 --- .../llvm/ExecutionEngine/JITLink/ELF_x86_64.h | 1 + .../llvm/ExecutionEngine/JITLink/x86_64.h | 44 +++++++++ .../ExecutionEngine/JITLink/ELF_x86_64.cpp | 95 ++++--------------- .../ExecutionEngine/JITLink/MachO_x86_64.cpp | 75 +-------------- llvm/lib/ExecutionEngine/JITLink/x86_64.cpp | 77 +++++++++++++++ 5 files changed, 139 insertions(+), 153 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h index bf6b449a1a7c9..dbd8866a5a28a 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h @@ -24,6 +24,7 @@ enum ELFX86RelocationKind : Edge::Kind { Pointer64, PCRel32, PCRel32GOTLoad, + PCRel32REXGOTLoad, PCRel64GOT, GOTOFF64, GOT64, diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 1533f24fe7760..fdf804047db7c 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -221,6 +221,20 @@ enum EdgeKind_x86_64 : Edge::Kind { /// phase will result in an 
assert/unreachable during the fixup phase RequestGOTAndTransformToDelta64FromGOT, + /// A PC-relative load of a GOT entry, relaxable if GOT entry target is + /// in-range of the fixup + /// + /// TODO: Explain the optimization + /// + /// Fixup expression + /// Fixup <- Target - (Fixup + 4) + Addend : int32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int32, otherwise + /// an out-of-range error will be returned. + // + PCRel32GOTLoadRelaxable, + /// A PC-relative REX load of a GOT entry, relaxable if GOT entry target /// is in-range of the fixup. /// @@ -257,6 +271,27 @@ enum EdgeKind_x86_64 : Edge::Kind { /// RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable, + /// A GOT entry getter/constructor, transformed to + /// PCRel32ToGOTLoadRelaxable pointing at the GOT entry for the original + /// target. + /// + /// Indicates that this edge should be lowered to a PC32ToGOTLoadRelaxable + /// targeting the GOT entry for the edge's current target, maintaining the + /// same addend. A GOT entry for the target should be created if one does not + /// already exist. + /// + /// Edges of this kind are usually lowered by a GOT builder pass inserted by + /// default. + /// + /// Fixup expression: + /// NONE + /// + /// Errors: + /// - *ASSERTION* Failure to handle edges of this kind prior to the fixup + /// phase will result in an assert/unreachable during the fixup phase. + /// + RequestGOTAndTransformToPCRel32GOTLoadRelaxable, + /// A PC-relative REX load of a Thread Local Variable Pointer (TLVP) entry, /// relaxable if the TLVP entry target is in-range of the fixup. /// @@ -301,6 +336,14 @@ enum EdgeKind_x86_64 : Edge::Kind { /// only. const char *getEdgeKindName(Edge::Kind K); +/// Optimize the GOT and Stub relocations if the edge target address is in range +/// 1. PCRel32GOTLoadRelaxable. For this edge kind, if the target is in range, +/// then replace GOT load with lea +/// 2. BranchPCRel32ToPtrJumpStubRelaxable. 
For this edge kind, if the target is +/// in range, replace a indirect jump by plt stub with a direct jump to the +/// target +Error optimize_x86_64_GOTAndStubs(LinkGraph &G); + /// Returns true if the given uint64_t value is in range for a uint32_t. inline bool isInRangeForImmU32(uint64_t Value) { return Value <= std::numeric_limits::max(); @@ -341,6 +384,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, case BranchPCRel32: case BranchPCRel32ToPtrJumpStub: case BranchPCRel32ToPtrJumpStubBypassable: + case PCRel32GOTLoadRelaxable: case PCRel32GOTLoadREXRelaxable: case PCRel32TLVPLoadREXRelaxable: { int64_t Value = diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index c47d6305fe06c..ec0077c470620 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -54,7 +54,9 @@ class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64 E.getKind() == x86_64::RequestGOTAndTransformToDelta64 || E.getKind() == x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable || - E.getKind() == x86_64::RequestGOTAndTransformToDelta64FromGOT; + E.getKind() == x86_64::RequestGOTAndTransformToDelta64FromGOT || + E.getKind() == + x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; } Symbol &createGOTEntry(Symbol &Target) { @@ -74,6 +76,9 @@ class PerGraphGOTAndPLTStubsBuilder_ELF_x86_64 case x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable: E.setKind(x86_64::PCRel32GOTLoadREXRelaxable); break; + case x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable: + E.setKind(x86_64::PCRel32GOTLoadRelaxable); + break; case x86_64::RequestGOTAndTransformToDelta64: E.setKind(x86_64::Delta64); break; @@ -149,82 +154,6 @@ const uint8_t PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::NullGOTEntryContent[8] = const uint8_t PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::StubContent[6] = { 0xFF, 0x25, 0x00, 0x00, 0x00, 0x00}; -static Error 
optimizeELF_x86_64_GOTAndStubs(LinkGraph &G) { - LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n"); - - for (auto *B : G.blocks()) - for (auto &E : B->edges()) - if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { - // Replace GOT load with LEA only for MOVQ instructions. - constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b}; - if (E.getOffset() < 3 || - strncmp(B->getContent().data() + E.getOffset() - 3, - reinterpret_cast(MOVQRIPRel), 2) != 0) - continue; - - auto &GOTBlock = E.getTarget().getBlock(); - assert(GOTBlock.getSize() == G.getPointerSize() && - "GOT entry block should be pointer sized"); - assert(GOTBlock.edges_size() == 1 && - "GOT entry should only have one outgoing edge"); - - auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); - JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); - JITTargetAddress TargetAddr = GOTTarget.getAddress(); - - int64_t Displacement = TargetAddr - EdgeAddr + 4; - if (Displacement >= std::numeric_limits::min() && - Displacement <= std::numeric_limits::max()) { - // Change the edge kind as we don't go through GOT anymore. This is - // for formal correctness only. Technically, the two relocation kinds - // are resolved the same way. 
- E.setKind(x86_64::Delta32); - E.setTarget(GOTTarget); - E.setAddend(E.getAddend() - 4); - auto *BlockData = reinterpret_cast( - const_cast(B->getContent().data())); - BlockData[E.getOffset() - 2] = 0x8d; - LLVM_DEBUG({ - dbgs() << " Replaced GOT load wih LEA:\n "; - printEdge(dbgs(), *B, E, getELFX86RelocationKindName(E.getKind())); - dbgs() << "\n"; - }); - } - } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) { - auto &StubBlock = E.getTarget().getBlock(); - assert( - StubBlock.getSize() == - sizeof(PerGraphGOTAndPLTStubsBuilder_ELF_x86_64::StubContent) && - "Stub block should be stub sized"); - assert(StubBlock.edges_size() == 1 && - "Stub block should only have one outgoing edge"); - - auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock(); - assert(GOTBlock.getSize() == G.getPointerSize() && - "GOT block should be pointer sized"); - assert(GOTBlock.edges_size() == 1 && - "GOT block should only have one outgoing edge"); - - auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); - JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); - JITTargetAddress TargetAddr = GOTTarget.getAddress(); - - int64_t Displacement = TargetAddr - EdgeAddr + 4; - if (Displacement >= std::numeric_limits::min() && - Displacement <= std::numeric_limits::max()) { - E.setKind(x86_64::BranchPCRel32); - E.setTarget(GOTTarget); - LLVM_DEBUG({ - dbgs() << " Replaced stub branch with direct branch:\n "; - printEdge(dbgs(), *B, E, getELFX86RelocationKindName(E.getKind())); - dbgs() << "\n"; - }); - } - } - - return Error::success(); -} - static const char *getELFX86_64RelocName(uint32_t Type) { switch (Type) { #define ELF_RELOC(Name, Number) \ @@ -256,8 +185,9 @@ class ELFLinkGraphBuilder_x86_64 : public ELFLinkGraphBuilder { return ELF_x86_64_Edges::ELFX86RelocationKind::Pointer64; case ELF::R_X86_64_GOTPCREL: case ELF::R_X86_64_GOTPCRELX: - case ELF::R_X86_64_REX_GOTPCRELX: return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoad; + case 
ELF::R_X86_64_REX_GOTPCRELX: + return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoad; case ELF::R_X86_64_GOTPCREL64: return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel64GOT; case ELF::R_X86_64_GOT64: @@ -371,6 +301,11 @@ class ELFLinkGraphBuilder_x86_64 : public ELFLinkGraphBuilder { Kind = x86_64::Pointer64; break; case PCRel32GOTLoad: { + Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; + Addend = 0; + break; + } + case PCRel32REXGOTLoad: { Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable; Addend = 0; break; @@ -545,7 +480,7 @@ void link_ELF_x86_64(std::unique_ptr G, identifyELFSectionStartAndEndSymbols)); // Add GOT/Stubs optimizer pass. - Config.PreFixupPasses.push_back(optimizeELF_x86_64_GOTAndStubs); + Config.PreFixupPasses.push_back(x86_64::optimize_x86_64_GOTAndStubs); } if (auto Err = Ctx->modifyPassConfig(*G, Config)) @@ -563,6 +498,8 @@ const char *getELFX86RelocationKindName(Edge::Kind R) { return "PCRel32"; case PCRel32GOTLoad: return "PCRel32GOTLoad"; + case PCRel32REXGOTLoad: + return "PCRel32REXGOTLoad"; case PCRel64GOT: return "PCRel64GOT"; case Delta64: diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 4905691b03b53..5e3b1e7b234f0 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -498,79 +498,6 @@ class PerGraphGOTAndPLTStubsBuilder_MachO_x86_64 } // namespace -static Error optimizeMachO_x86_64_GOTAndStubs(LinkGraph &G) { - LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n"); - - for (auto *B : G.blocks()) - for (auto &E : B->edges()) - if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { - assert(E.getOffset() >= 3 && "GOT edge occurs too early in block"); - - // Optimize GOT references. 
- auto &GOTBlock = E.getTarget().getBlock(); - assert(GOTBlock.getSize() == G.getPointerSize() && - "GOT entry block should be pointer sized"); - assert(GOTBlock.edges_size() == 1 && - "GOT entry should only have one outgoing edge"); - - auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); - JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); - JITTargetAddress TargetAddr = GOTTarget.getAddress(); - - // Check that this is a recognized MOV instruction. - // FIXME: Can we assume this? - constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b}; - if (strncmp(B->getContent().data() + E.getOffset() - 3, - reinterpret_cast(MOVQRIPRel), 2) != 0) - continue; - - int64_t Displacement = TargetAddr - EdgeAddr + 4; - if (Displacement >= std::numeric_limits::min() && - Displacement <= std::numeric_limits::max()) { - E.setTarget(GOTTarget); - E.setKind(x86_64::Delta32); - E.setAddend(E.getAddend() - 4); - char *BlockData = B->getMutableContent(G).data(); - BlockData[E.getOffset() - 2] = (char)0x8d; - LLVM_DEBUG({ - dbgs() << " Replaced GOT load wih LEA:\n "; - printEdge(dbgs(), *B, E, x86_64::getEdgeKindName(E.getKind())); - dbgs() << "\n"; - }); - } - } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) { - auto &StubBlock = E.getTarget().getBlock(); - assert(StubBlock.getSize() == sizeof(x86_64::PointerJumpStubContent) && - "Stub block should be stub sized"); - assert(StubBlock.edges_size() == 1 && - "Stub block should only have one outgoing edge"); - - auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock(); - assert(GOTBlock.getSize() == G.getPointerSize() && - "GOT block should be pointer sized"); - assert(GOTBlock.edges_size() == 1 && - "GOT block should only have one outgoing edge"); - - auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); - JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); - JITTargetAddress TargetAddr = GOTTarget.getAddress(); - - int64_t Displacement = TargetAddr - EdgeAddr + 4; - if (Displacement >= 
std::numeric_limits::min() && - Displacement <= std::numeric_limits::max()) { - E.setKind(x86_64::BranchPCRel32); - E.setTarget(GOTTarget); - LLVM_DEBUG({ - dbgs() << " Replaced stub branch with direct branch:\n "; - printEdge(dbgs(), *B, E, x86_64::getEdgeKindName(E.getKind())); - dbgs() << "\n"; - }); - } - } - - return Error::success(); -} - namespace llvm { namespace jitlink { @@ -618,7 +545,7 @@ void link_MachO_x86_64(std::unique_ptr G, PerGraphGOTAndPLTStubsBuilder_MachO_x86_64::asPass); // Add GOT/Stubs optimizer pass. - Config.PreFixupPasses.push_back(optimizeMachO_x86_64_GOTAndStubs); + Config.PreFixupPasses.push_back(x86_64::optimize_x86_64_GOTAndStubs); } if (auto Err = Ctx->modifyPassConfig(*G, Config)) diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index ad95fe483ba8c..354442c4cfd98 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -50,6 +50,10 @@ const char *getEdgeKindName(Edge::Kind K) { return "PCRel32GOTLoadREXRelaxable"; case RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable: return "RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable"; + case PCRel32GOTLoadRelaxable: + return "PCRel32GOTLoadRelaxable"; + case RequestGOTAndTransformToPCRel32GOTLoadRelaxable: + return "RequestGOTAndTransformToPCRel32GOTLoadRelaxable"; case PCRel32TLVPLoadREXRelaxable: return "PCRel32TLVPLoadREXRelaxable"; case RequestTLVPAndTransformToPCRel32TLVPLoadREXRelaxable: @@ -65,6 +69,79 @@ const char NullPointerContent[PointerSize] = {0x00, 0x00, 0x00, 0x00, const char PointerJumpStubContent[6] = { static_cast(0xFFu), 0x25, 0x00, 0x00, 0x00, 0x00}; +Error optimize_x86_64_GOTAndStubs(LinkGraph &G) { + LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n"); + + for (auto *B : G.blocks()) + for (auto &E : B->edges()) + if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { + // Replace GOT load with LEA only for MOVQ instructions. 
+ assert(E.getOffset() >= 3 && "GOT edge occurs too early in block"); + + constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b}; + if (strncmp(B->getContent().data() + E.getOffset() - 3, + reinterpret_cast(MOVQRIPRel), 2) != 0) + continue; + + auto &GOTBlock = E.getTarget().getBlock(); + assert(GOTBlock.getSize() == G.getPointerSize() && + "GOT entry block should be pointer sized"); + assert(GOTBlock.edges_size() == 1 && + "GOT entry should only have one outgoing edge"); + + auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); + JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); + JITTargetAddress TargetAddr = GOTTarget.getAddress(); + + int64_t Displacement = TargetAddr - EdgeAddr + 4; + if (isInRangeForImmS32(Displacement)) { + // Change the edge kind as we don't go through GOT anymore. This is + // for formal correctness only. Technically, the two relocation kinds + // are resolved the same way. + E.setKind(x86_64::Delta32); + E.setTarget(GOTTarget); + E.setAddend(E.getAddend() - 4); + auto *BlockData = reinterpret_cast( + const_cast(B->getContent().data())); + BlockData[E.getOffset() - 2] = 0x8d; + LLVM_DEBUG({ + dbgs() << " Replaced GOT load wih LEA:\n "; + printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind())); + dbgs() << "\n"; + }); + } + } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) { + auto &StubBlock = E.getTarget().getBlock(); + assert(StubBlock.getSize() == sizeof(PointerJumpStubContent) && + "Stub block should be stub sized"); + assert(StubBlock.edges_size() == 1 && + "Stub block should only have one outgoing edge"); + + auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock(); + assert(GOTBlock.getSize() == G.getPointerSize() && + "GOT block should be pointer sized"); + assert(GOTBlock.edges_size() == 1 && + "GOT block should only have one outgoing edge"); + + auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); + JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); + JITTargetAddress TargetAddr = 
GOTTarget.getAddress(); + + int64_t Displacement = TargetAddr - EdgeAddr + 4; + if (isInRangeForImmS32(Displacement)) { + E.setKind(x86_64::BranchPCRel32); + E.setTarget(GOTTarget); + LLVM_DEBUG({ + dbgs() << " Replaced stub branch with direct branch:\n "; + printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind())); + dbgs() << "\n"; + }); + } + } + + return Error::success(); +} + } // end namespace x86_64 } // end namespace jitlink } // end namespace llvm From e11354c0a40560b2c76945526476f00f65275d71 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 14 Aug 2021 20:54:19 +0200 Subject: [PATCH 039/700] [Tests] Remove explicit -enable-mssa-loop-dependency options (NFC) This is enabled by default. Drop explicit uses in preparation for removing the option. Also drop RUN lines that are now the same (typically modulo a -verify-memoryssa option). --- llvm/test/Analysis/BasicAA/store-promote.ll | 2 +- llvm/test/Analysis/MemorySSA/debugvalue.ll | 2 +- llvm/test/Analysis/MemorySSA/loop-rotate-inv-template.ll | 2 +- llvm/test/Analysis/MemorySSA/loop-rotate-simplified-clone.ll | 2 +- llvm/test/Analysis/MemorySSA/loop-rotate-valuemap.ll | 2 +- llvm/test/Analysis/MemorySSA/loop-unswitch.ll | 2 +- .../Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll | 2 +- llvm/test/Analysis/MemorySSA/nondeterminism.ll | 2 +- llvm/test/Analysis/MemorySSA/pr39197.ll | 2 +- llvm/test/Analysis/MemorySSA/pr40037.ll | 2 +- llvm/test/Analysis/MemorySSA/pr40038.ll | 2 +- llvm/test/Analysis/MemorySSA/pr40509.ll | 2 +- llvm/test/Analysis/MemorySSA/pr40749.ll | 2 +- llvm/test/Analysis/MemorySSA/pr40749_2.ll | 2 +- llvm/test/Analysis/MemorySSA/pr40754.ll | 2 +- llvm/test/Analysis/MemorySSA/pr41254.ll | 2 +- llvm/test/Analysis/MemorySSA/pr41640.ll | 2 +- llvm/test/Analysis/MemorySSA/pr41853.ll | 1 - llvm/test/Analysis/MemorySSA/pr42294.ll | 4 ++-- llvm/test/Analysis/MemorySSA/pr42940.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43044.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43317.ll | 2 +- 
llvm/test/Analysis/MemorySSA/pr43320.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43426.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43427.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43438.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43493.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43540.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43541.ll | 2 +- llvm/test/Analysis/MemorySSA/pr43641.ll | 2 +- llvm/test/Analysis/MemorySSA/renamephis.ll | 2 +- llvm/test/Analysis/MemorySSA/unreachable.ll | 2 +- llvm/test/Analysis/MemorySSA/update_unroll.ll | 2 +- llvm/test/CodeGen/PowerPC/pr35688.ll | 2 +- llvm/test/Transforms/LICM/argmemonly-call.ll | 2 +- llvm/test/Transforms/LICM/atomics.ll | 2 +- llvm/test/Transforms/LICM/guards.ll | 2 +- llvm/test/Transforms/LICM/hoist-bitcast-load.ll | 3 +-- llvm/test/Transforms/LICM/hoist-debuginvariant.ll | 3 +-- llvm/test/Transforms/LICM/hoist-deref-load.ll | 3 +-- llvm/test/Transforms/LICM/hoist-fast-fdiv.ll | 3 +-- llvm/test/Transforms/LICM/hoist-invariant-load.ll | 3 +-- llvm/test/Transforms/LICM/hoist-nounwind.ll | 5 ++--- llvm/test/Transforms/LICM/hoist-round.ll | 3 +-- llvm/test/Transforms/LICM/hoisting.ll | 3 +-- llvm/test/Transforms/LICM/pr40317.ll | 2 +- llvm/test/Transforms/LICM/pr42969.ll | 2 +- llvm/test/Transforms/LICM/promote-order.ll | 2 +- llvm/test/Transforms/LICM/read-only-calls.ll | 2 +- llvm/test/Transforms/LICM/sink.ll | 5 ++--- llvm/test/Transforms/LICM/sinking.ll | 3 +-- llvm/test/Transforms/LICM/store-hoisting.ll | 2 +- llvm/test/Transforms/LICM/volatile-alias.ll | 3 +-- llvm/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll | 3 +-- llvm/test/Transforms/LoopRotate/PhiRename-1.ll | 3 +-- llvm/test/Transforms/LoopRotate/PhiSelfReference-1.ll | 3 +-- llvm/test/Transforms/LoopRotate/alloca.ll | 2 +- llvm/test/Transforms/LoopRotate/basic.ll | 3 +-- llvm/test/Transforms/LoopRotate/callbr.ll | 1 - llvm/test/Transforms/LoopRotate/catchret.ll | 3 +-- llvm/test/Transforms/LoopRotate/convergent.ll | 3 +-- 
llvm/test/Transforms/LoopRotate/crash.ll | 3 +-- llvm/test/Transforms/LoopRotate/dbg-value-duplicates.ll | 3 +-- llvm/test/Transforms/LoopRotate/dbgvalue.ll | 3 +-- llvm/test/Transforms/LoopRotate/indirectbr.ll | 3 +-- llvm/test/Transforms/LoopRotate/loopexitinglatch.ll | 3 +-- llvm/test/Transforms/LoopRotate/multiple-exits.ll | 3 +-- llvm/test/Transforms/LoopRotate/noalias.ll | 5 ++--- llvm/test/Transforms/LoopRotate/phi-dbgvalue.ll | 3 +-- llvm/test/Transforms/LoopRotate/phi-duplicate.ll | 2 +- llvm/test/Transforms/LoopRotate/pr22337.ll | 3 +-- llvm/test/Transforms/LoopRotate/pr33701.ll | 3 +-- llvm/test/Transforms/LoopRotate/pr37205.ll | 2 +- llvm/test/Transforms/LoopRotate/preserve-mssa.ll | 2 +- llvm/test/Transforms/LoopRotate/preserve-scev.ll | 3 +-- llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll | 2 +- llvm/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/lcssa.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/live_block_marking.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/merge-header.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/mssa_update.ll | 3 +-- .../LoopSimplifyCFG/phi_with_duplicating_inputs.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/pr39783.ll | 2 +- llvm/test/Transforms/LoopSimplifyCFG/scev.ll | 3 +-- llvm/test/Transforms/LoopSimplifyCFG/update_parents.ll | 3 +-- .../Transforms/LoopUnswitch/2006-06-13-SingleEntryPHI.ll | 3 +-- .../Transforms/LoopUnswitch/2006-06-27-DeadSwitchCase.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2007-05-09-Unreachable.ll | 2 +- llvm/test/Transforms/LoopUnswitch/2007-05-09-tl.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2007-07-12-ExitDomInfo.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2007-07-13-DomInfo.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2007-07-18-DomInfo.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2007-08-01-LCSSA.ll | 3 +-- 
llvm/test/Transforms/LoopUnswitch/2008-06-02-DomInfo.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2010-11-18-LCSSA.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2011-09-26-EHCrash.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll | 3 +-- .../LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/2015-09-18-Addrspace.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/LIV-loop-condtion.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/basictest.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/cleanuppad.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/copy-metadata.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/crash.ll | 3 +-- .../LoopUnswitch/elseif-non-exponential-behavior.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/exponential-behavior.ll | 2 +- llvm/test/Transforms/LoopUnswitch/guards.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/infinite-loop.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/invalidate-scev.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/msan.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/pr32818.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/preserve-analyses.ll | 3 +-- .../test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/trivial-unswitch.ll | 3 +-- llvm/test/Transforms/LoopUnswitch/unswitch-equality-undef.ll | 2 +- llvm/test/Transforms/LoopUnswitch/unswitch-select.ll | 3 +-- .../SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll | 3 +-- .../SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll | 3 +-- 
.../test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll | 3 +-- .../test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll | 3 +-- .../test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll | 3 +-- .../test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/crash.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/exponential-behavior.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll | 3 +-- .../SimpleLoopUnswitch/nontrivial-unswitch-cost.ll | 3 +-- .../Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll | 3 +-- llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll | 3 +-- 146 files changed, 148 insertions(+), 244 deletions(-) diff --git a/llvm/test/Analysis/BasicAA/store-promote.ll b/llvm/test/Analysis/BasicAA/store-promote.ll index af2aa8d467d15..4bb5d4cfadd66 100644 --- a/llvm/test/Analysis/BasicAA/store-promote.ll +++ b/llvm/test/Analysis/BasicAA/store-promote.ll @@ -3,7 +3,7 @@ ; two pointers, then the load should be hoisted, and the store sunk. 
; RUN: opt < %s -basic-aa -licm -enable-mssa-loop-dependency=false -enable-new-pm=0 -S | FileCheck %s -check-prefixes=CHECK,AST -; RUN: opt < %s -basic-aa -licm -enable-mssa-loop-dependency=true -enable-new-pm=0 -S | FileCheck %s -check-prefixes=CHECK,MSSA +; RUN: opt < %s -basic-aa -licm -enable-new-pm=0 -S | FileCheck %s -check-prefixes=CHECK,MSSA ; RUN: opt < %s -aa-pipeline=basic-aa -passes='loop(licm)' -S | FileCheck %s -check-prefixes=CHECK,AST ; RUN: opt < %s -aa-pipeline=basic-aa -passes='loop-mssa(licm)' -S | FileCheck %s -check-prefixes=CHECK,MSSA target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" diff --git a/llvm/test/Analysis/MemorySSA/debugvalue.ll b/llvm/test/Analysis/MemorySSA/debugvalue.ll index 3e73b98279499..a0b9b63ef64a3 100644 --- a/llvm/test/Analysis/MemorySSA/debugvalue.ll +++ b/llvm/test/Analysis/MemorySSA/debugvalue.ll @@ -1,4 +1,4 @@ -; RUN: opt -disable-basic-aa -loop-rotate -enable-mssa-loop-dependency -verify-memoryssa -S %s | FileCheck %s +; RUN: opt -disable-basic-aa -loop-rotate -verify-memoryssa -S %s | FileCheck %s ; REQUIRES: asserts ; CHECK-LABEL: @f_w4_i2 diff --git a/llvm/test/Analysis/MemorySSA/loop-rotate-inv-template.ll b/llvm/test/Analysis/MemorySSA/loop-rotate-inv-template.ll index ec2e8e6e84117..0760d946ccc75 100644 --- a/llvm/test/Analysis/MemorySSA/loop-rotate-inv-template.ll +++ b/llvm/test/Analysis/MemorySSA/loop-rotate-inv-template.ll @@ -1,4 +1,4 @@ -; RUN: opt -disable-output -loop-rotate -enable-mssa-loop-dependency -verify-memoryssa %s +; RUN: opt -disable-output -loop-rotate -verify-memoryssa %s ; REQUIRES: asserts ; Function Attrs: nounwind diff --git a/llvm/test/Analysis/MemorySSA/loop-rotate-simplified-clone.ll b/llvm/test/Analysis/MemorySSA/loop-rotate-simplified-clone.ll index 3d0efc6f6bd4d..8769d60b26635 100644 --- a/llvm/test/Analysis/MemorySSA/loop-rotate-simplified-clone.ll +++ 
b/llvm/test/Analysis/MemorySSA/loop-rotate-simplified-clone.ll @@ -1,4 +1,4 @@ -; RUN: opt -verify-memoryssa -enable-mssa-loop-dependency -loop-rotate %s -S | FileCheck %s +; RUN: opt -verify-memoryssa -loop-rotate %s -S | FileCheck %s ; REQUIRES: asserts ; CHECK-LABEL: @test() diff --git a/llvm/test/Analysis/MemorySSA/loop-rotate-valuemap.ll b/llvm/test/Analysis/MemorySSA/loop-rotate-valuemap.ll index 1ee5fcabdebdb..75fb0c61180b0 100644 --- a/llvm/test/Analysis/MemorySSA/loop-rotate-valuemap.ll +++ b/llvm/test/Analysis/MemorySSA/loop-rotate-valuemap.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-rotate -enable-mssa-loop-dependency %s -S | FileCheck %s +; RUN: opt -loop-rotate %s -S | FileCheck %s ; REQUIRES: asserts ; Check that loop rotate keeps proper mapping between cloned instructions, diff --git a/llvm/test/Analysis/MemorySSA/loop-unswitch.ll b/llvm/test/Analysis/MemorySSA/loop-unswitch.ll index cc511fec32b35..9a02ea7334c1c 100644 --- a/llvm/test/Analysis/MemorySSA/loop-unswitch.ll +++ b/llvm/test/Analysis/MemorySSA/loop-unswitch.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -simple-loop-unswitch -disable-basic-aa -enable-mssa-loop-dependency -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -simple-loop-unswitch -disable-basic-aa -verify-memoryssa < %s | FileCheck %s ; REQUIRES: asserts target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll b/llvm/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll index d96a3d41cde1b..493e51d581daf 100644 --- a/llvm/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll +++ b/llvm/test/Analysis/MemorySSA/loop_rotate_remove_trivial_phis.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-rotate -enable-new-pm=0 -print-memoryssa -disable-output -enable-mssa-loop-dependency -verify-memoryssa %s 2>&1 | FileCheck %s +; RUN: opt -loop-rotate -enable-new-pm=0 -print-memoryssa -disable-output -verify-memoryssa %s 2>&1 | FileCheck %s ; RUN: opt -passes='loop-mssa(loop-rotate),print' 
-disable-output -verify-memoryssa %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/llvm/test/Analysis/MemorySSA/nondeterminism.ll b/llvm/test/Analysis/MemorySSA/nondeterminism.ll index 0bb3df30b5878..230c6e61fb33e 100644 --- a/llvm/test/Analysis/MemorySSA/nondeterminism.ll +++ b/llvm/test/Analysis/MemorySSA/nondeterminism.ll @@ -1,4 +1,4 @@ -; RUN: opt -simplifycfg -enable-mssa-loop-dependency -S --preserve-ll-uselistorder %s | FileCheck %s +; RUN: opt -simplifycfg -S --preserve-ll-uselistorder %s | FileCheck %s ; REQUIRES: x86-registered-target ; CHECK-LABEL: @n ; CHECK: uselistorder i16 0, { 3, 2, 4, 1, 5, 0, 6 } diff --git a/llvm/test/Analysis/MemorySSA/pr39197.ll b/llvm/test/Analysis/MemorySSA/pr39197.ll index 115a7748dcf5c..068d4f6fc438f 100644 --- a/llvm/test/Analysis/MemorySSA/pr39197.ll +++ b/llvm/test/Analysis/MemorySSA/pr39197.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=s390x-linux-gnu -mcpu=z13 -enable-mssa-loop-dependency -verify-memoryssa -sroa -globalopt -function-attrs -simplifycfg -licm -simple-loop-unswitch %s -S | FileCheck %s +; RUN: opt -mtriple=s390x-linux-gnu -mcpu=z13 -verify-memoryssa -sroa -globalopt -function-attrs -simplifycfg -licm -simple-loop-unswitch %s -S | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr40037.ll b/llvm/test/Analysis/MemorySSA/pr40037.ll index a123a06ac50b0..91dcb79541b31 100644 --- a/llvm/test/Analysis/MemorySSA/pr40037.ll +++ b/llvm/test/Analysis/MemorySSA/pr40037.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -S -simple-loop-unswitch -enable-mssa-loop-dependency -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -simple-loop-unswitch -verify-memoryssa < %s | FileCheck %s target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" diff --git a/llvm/test/Analysis/MemorySSA/pr40038.ll b/llvm/test/Analysis/MemorySSA/pr40038.ll index 
844ede33cd88c..75032b26ef2a7 100644 --- a/llvm/test/Analysis/MemorySSA/pr40038.ll +++ b/llvm/test/Analysis/MemorySSA/pr40038.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -S -mtriple=systemz-unknown -mcpu=z13 -O3 -enable-mssa-loop-dependency -enable-simple-loop-unswitch -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -mtriple=systemz-unknown -mcpu=z13 -O3 -enable-simple-loop-unswitch -verify-memoryssa < %s | FileCheck %s target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" diff --git a/llvm/test/Analysis/MemorySSA/pr40509.ll b/llvm/test/Analysis/MemorySSA/pr40509.ll index 1dbb6cfba3f61..55a2ad88be54b 100644 --- a/llvm/test/Analysis/MemorySSA/pr40509.ll +++ b/llvm/test/Analysis/MemorySSA/pr40509.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -O3 -enable-mssa-loop-dependency -disable-output %s +; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -O3 -disable-output %s ; During transform to LCSSA, an access becomes obfuscated to: ; (2 = phi (phi(val), val)), which BasicAA fails to analyze. 
diff --git a/llvm/test/Analysis/MemorySSA/pr40749.ll b/llvm/test/Analysis/MemorySSA/pr40749.ll index fe51f1038bb8b..461129eabb7c5 100644 --- a/llvm/test/Analysis/MemorySSA/pr40749.ll +++ b/llvm/test/Analysis/MemorySSA/pr40749.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S < %s | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr40749_2.ll b/llvm/test/Analysis/MemorySSA/pr40749_2.ll index 50f1c4f71c8bc..3ebeb54963efc 100644 --- a/llvm/test/Analysis/MemorySSA/pr40749_2.ll +++ b/llvm/test/Analysis/MemorySSA/pr40749_2.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -licm -simple-loop-unswitch -enable-mssa-loop-dependency -verify-memoryssa %s | FileCheck %s +; RUN: opt -S -licm -simple-loop-unswitch -verify-memoryssa %s | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" diff --git a/llvm/test/Analysis/MemorySSA/pr40754.ll b/llvm/test/Analysis/MemorySSA/pr40754.ll index 3262a0cdd46e8..ce8956bfd1b17 100644 --- a/llvm/test/Analysis/MemorySSA/pr40754.ll +++ b/llvm/test/Analysis/MemorySSA/pr40754.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S < %s | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr41254.ll b/llvm/test/Analysis/MemorySSA/pr41254.ll index c8b21449bce3b..debb18dc07790 100644 --- a/llvm/test/Analysis/MemorySSA/pr41254.ll +++ b/llvm/test/Analysis/MemorySSA/pr41254.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S < %s | FileCheck %s ; REQUIRES: asserts target datalayout = 
"E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr41640.ll b/llvm/test/Analysis/MemorySSA/pr41640.ll index cf25535819530..f72013ade8370 100644 --- a/llvm/test/Analysis/MemorySSA/pr41640.ll +++ b/llvm/test/Analysis/MemorySSA/pr41640.ll @@ -1,4 +1,4 @@ -; RUN: opt -disable-output -licm -enable-new-pm=0 -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s +; RUN: opt -disable-output -licm -enable-new-pm=0 -print-memoryssa < %s 2>&1 | FileCheck %s ; RUN: opt -disable-output -passes='loop-mssa(licm),print' < %s 2>&1 | FileCheck %s target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" diff --git a/llvm/test/Analysis/MemorySSA/pr41853.ll b/llvm/test/Analysis/MemorySSA/pr41853.ll index f7bf21c9f90f2..d28b2c155aad7 100644 --- a/llvm/test/Analysis/MemorySSA/pr41853.ll +++ b/llvm/test/Analysis/MemorySSA/pr41853.ll @@ -1,5 +1,4 @@ ; RUN: opt -S -memoryssa -loop-simplify -early-cse-memssa -earlycse-debug-hash -verify-memoryssa %s | FileCheck %s -; RUN: opt -S -memoryssa -loop-simplify -early-cse-memssa -enable-mssa-loop-dependency -verify-memoryssa %s | FileCheck %s ; REQUIRES: asserts target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/MemorySSA/pr42294.ll b/llvm/test/Analysis/MemorySSA/pr42294.ll index ce278fc3eabcf..e5a687afcab71 100644 --- a/llvm/test/Analysis/MemorySSA/pr42294.ll +++ b/llvm/test/Analysis/MemorySSA/pr42294.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts -; RUN: opt -loop-rotate -licm %s -disable-output -enable-mssa-loop-dependency=true -debug-only=licm 2>&1 | FileCheck %s -check-prefix=LICM +; RUN: opt -loop-rotate -licm %s -disable-output -debug-only=licm 2>&1 | FileCheck %s -check-prefix=LICM ; RUN: opt -loop-rotate -licm %s -disable-output -enable-mssa-loop-dependency=false -debug-only=licm 2>&1 | FileCheck %s -check-prefix=LICM -; RUN: opt -loop-rotate -licm %s -S -enable-mssa-loop-dependency=true | 
FileCheck %s +; RUN: opt -loop-rotate -licm %s -S | FileCheck %s ; RUN: opt -loop-rotate -licm %s -S -enable-mssa-loop-dependency=false | FileCheck %s ; LICM: Using diff --git a/llvm/test/Analysis/MemorySSA/pr42940.ll b/llvm/test/Analysis/MemorySSA/pr42940.ll index ccd3007c45ad1..fab8c48a2b456 100644 --- a/llvm/test/Analysis/MemorySSA/pr42940.ll +++ b/llvm/test/Analysis/MemorySSA/pr42940.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa -S %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S %s | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr43044.ll b/llvm/test/Analysis/MemorySSA/pr43044.ll index a0b2cf7b9f742..cf1a759e0b6d3 100644 --- a/llvm/test/Analysis/MemorySSA/pr43044.ll +++ b/llvm/test/Analysis/MemorySSA/pr43044.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-rotate -licm -enable-mssa-loop-dependency -verify-memoryssa %s -S | FileCheck %s +; RUN: opt -loop-rotate -licm -verify-memoryssa %s -S | FileCheck %s ; REQUIRES: asserts target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" diff --git a/llvm/test/Analysis/MemorySSA/pr43317.ll b/llvm/test/Analysis/MemorySSA/pr43317.ll index eb46252568536..36b60d0b4ea77 100644 --- a/llvm/test/Analysis/MemorySSA/pr43317.ll +++ b/llvm/test/Analysis/MemorySSA/pr43317.ll @@ -1,4 +1,4 @@ -; RUN: opt -disable-output -licm -enable-new-pm=0 -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s +; RUN: opt -disable-output -licm -enable-new-pm=0 -print-memoryssa < %s 2>&1 | FileCheck %s ; RUN: opt -disable-output -passes='loop-mssa(licm),print' < %s 2>&1 | FileCheck %s @v_274 = external dso_local global i64, align 1 @v_295 = external dso_local global i16, align 1 diff --git a/llvm/test/Analysis/MemorySSA/pr43320.ll b/llvm/test/Analysis/MemorySSA/pr43320.ll index 6aca3f9eeb148..0527469c1da7d 100644 --- a/llvm/test/Analysis/MemorySSA/pr43320.ll 
+++ b/llvm/test/Analysis/MemorySSA/pr43320.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S < %s | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Analysis/MemorySSA/pr43426.ll b/llvm/test/Analysis/MemorySSA/pr43426.ll index f603c32cf8da6..3e0dd5bafc47b 100644 --- a/llvm/test/Analysis/MemorySSA/pr43426.ll +++ b/llvm/test/Analysis/MemorySSA/pr43426.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -S %s | FileCheck %s +; RUN: opt -licm -S %s | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/MemorySSA/pr43427.ll b/llvm/test/Analysis/MemorySSA/pr43427.ll index 00a015c98e8fd..ffd5775685204 100644 --- a/llvm/test/Analysis/MemorySSA/pr43427.ll +++ b/llvm/test/Analysis/MemorySSA/pr43427.ll @@ -1,4 +1,4 @@ -; RUN: opt -disable-output -licm -enable-new-pm=0 -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s +; RUN: opt -disable-output -licm -enable-new-pm=0 -print-memoryssa < %s 2>&1 | FileCheck %s ; RUN: opt -disable-output -aa-pipeline=basic-aa -passes='loop-mssa(licm),print' < %s 2>&1 | FileCheck %s ; CHECK-LABEL: @f() diff --git a/llvm/test/Analysis/MemorySSA/pr43438.ll b/llvm/test/Analysis/MemorySSA/pr43438.ll index e15ab0e93c3c9..1f62b72399285 100644 --- a/llvm/test/Analysis/MemorySSA/pr43438.ll +++ b/llvm/test/Analysis/MemorySSA/pr43438.ll @@ -1,4 +1,4 @@ -; RUN: opt -disable-output -loop-simplify -licm -enable-new-pm=0 -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s +; RUN: opt -disable-output -loop-simplify -licm -enable-new-pm=0 -print-memoryssa < %s 2>&1 | FileCheck %s ; RUN: opt -disable-output -aa-pipeline=basic-aa -passes='loop-mssa(licm),print' < %s 2>&1 | FileCheck %s target triple = 
"x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/MemorySSA/pr43493.ll b/llvm/test/Analysis/MemorySSA/pr43493.ll index 69506e8c0434b..68b0d3e493293 100644 --- a/llvm/test/Analysis/MemorySSA/pr43493.ll +++ b/llvm/test/Analysis/MemorySSA/pr43493.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-mssa-loop-dependency=true -loop-rotate -verify-memoryssa -S %s | FileCheck %s +; RUN: opt -loop-rotate -verify-memoryssa -S %s | FileCheck %s ; REQUIRES: asserts ; CHECK-LABEL: @func_35() diff --git a/llvm/test/Analysis/MemorySSA/pr43540.ll b/llvm/test/Analysis/MemorySSA/pr43540.ll index 325e6bc0ae8f7..a75b6d64be0db 100644 --- a/llvm/test/Analysis/MemorySSA/pr43540.ll +++ b/llvm/test/Analysis/MemorySSA/pr43540.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -licm -enable-mssa-loop-dependency=true %s | FileCheck %s +; RUN: opt -S -licm %s | FileCheck %s @v_1 = global i8 0, align 1 @v_2 = global i8 0, align 1 diff --git a/llvm/test/Analysis/MemorySSA/pr43541.ll b/llvm/test/Analysis/MemorySSA/pr43541.ll index 3f6b2e26bce86..9cc9b6fe5ff2e 100644 --- a/llvm/test/Analysis/MemorySSA/pr43541.ll +++ b/llvm/test/Analysis/MemorySSA/pr43541.ll @@ -1,4 +1,4 @@ -; RUN: opt -gvn-hoist -enable-mssa-loop-dependency -S < %s | FileCheck %s +; RUN: opt -gvn-hoist -S < %s | FileCheck %s ; REQUIRES: asserts %struct.job_pool.6.7 = type { i32 } diff --git a/llvm/test/Analysis/MemorySSA/pr43641.ll b/llvm/test/Analysis/MemorySSA/pr43641.ll index 06a6b5255b3e1..5e0dc73a0fd30 100644 --- a/llvm/test/Analysis/MemorySSA/pr43641.ll +++ b/llvm/test/Analysis/MemorySSA/pr43641.ll @@ -1,4 +1,4 @@ -; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -enable-mssa-loop-dependency -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -verify-memoryssa -S < %s | FileCheck %s ; REQUIRES: asserts ; CHECK-LABEL: @c diff --git a/llvm/test/Analysis/MemorySSA/renamephis.ll b/llvm/test/Analysis/MemorySSA/renamephis.ll index 7d240d823fa23..576f8e61e67c4 100644 --- 
a/llvm/test/Analysis/MemorySSA/renamephis.ll +++ b/llvm/test/Analysis/MemorySSA/renamephis.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa -S %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S %s | FileCheck %s ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Analysis/MemorySSA/unreachable.ll b/llvm/test/Analysis/MemorySSA/unreachable.ll index 6a93643537343..c6de66c69b027 100644 --- a/llvm/test/Analysis/MemorySSA/unreachable.ll +++ b/llvm/test/Analysis/MemorySSA/unreachable.ll @@ -1,4 +1,4 @@ -; RUN: opt -licm -enable-mssa-loop-dependency -verify-memoryssa %s -S | FileCheck %s +; RUN: opt -licm -verify-memoryssa %s -S | FileCheck %s ; REQUIRES: asserts ; Ensure verification doesn't fail with unreachable blocks. diff --git a/llvm/test/Analysis/MemorySSA/update_unroll.ll b/llvm/test/Analysis/MemorySSA/update_unroll.ll index cdf5d186e8654..006b97ac56b2f 100644 --- a/llvm/test/Analysis/MemorySSA/update_unroll.ll +++ b/llvm/test/Analysis/MemorySSA/update_unroll.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-new-pm=0 -enable-mssa-loop-dependency -verify-memoryssa -loop-rotate -S %s | FileCheck %s +; RUN: opt -enable-new-pm=0 -verify-memoryssa -loop-rotate -S %s | FileCheck %s ; RUN: opt -verify-memoryssa -passes='loop-mssa(loop-rotate)' -S %s | FileCheck %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/PowerPC/pr35688.ll b/llvm/test/CodeGen/PowerPC/pr35688.ll index 2be1add6cdfce..fd1db8332ea6e 100644 --- a/llvm/test/CodeGen/PowerPC/pr35688.ll +++ b/llvm/test/CodeGen/PowerPC/pr35688.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -enable-mssa-loop-dependency=false -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ ; RUN: FileCheck %s -; RUN: llc -enable-mssa-loop-dependency=true -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ +; RUN: llc 
-verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ ; RUN: FileCheck %s --check-prefix=MSSA ; Function Attrs: nounwind define void @ec_GFp_nistp256_points_mul() { diff --git a/llvm/test/Transforms/LICM/argmemonly-call.ll b/llvm/test/Transforms/LICM/argmemonly-call.ll index 4e505038c9b88..4098daf6e7206 100644 --- a/llvm/test/Transforms/LICM/argmemonly-call.ll +++ b/llvm/test/Transforms/LICM/argmemonly-call.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=false %s -enable-new-pm=0 | FileCheck %s -; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa %s -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 +; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -verify-memoryssa %s -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 ; RUN: opt -licm -basic-aa -licm-n2-threshold=200 < %s -S -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 ; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=0 -passes='require,require,require,require,loop(licm)' < %s -S | FileCheck %s diff --git a/llvm/test/Transforms/LICM/atomics.ll b/llvm/test/Transforms/LICM/atomics.ll index 1feb8932c337a..cfa177323490c 100644 --- a/llvm/test/Transforms/LICM/atomics.ll +++ b/llvm/test/Transforms/LICM/atomics.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -basic-aa -licm -enable-mssa-loop-dependency=false -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,AST %s -; RUN: opt < %s -S -basic-aa -licm -enable-mssa-loop-dependency=true -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,MSSA %s +; RUN: opt < %s -S -basic-aa -licm -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,MSSA %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop(licm)' < %s -S | FileCheck -check-prefixes=CHECK,AST %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop-mssa(licm)' < %s -S | FileCheck -check-prefixes=CHECK,MSSA %s diff --git a/llvm/test/Transforms/LICM/guards.ll 
b/llvm/test/Transforms/LICM/guards.ll index e4dad88e49982..df97cac7544de 100644 --- a/llvm/test/Transforms/LICM/guards.ll +++ b/llvm/test/Transforms/LICM/guards.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts ; RUN: opt -licm -basic-aa -enable-mssa-loop-dependency=false -ipt-expensive-asserts=true < %s -S | FileCheck %s -; RUN: opt -licm -basic-aa -enable-mssa-loop-dependency=true -ipt-expensive-asserts=true < %s -S | FileCheck %s --check-prefixes=CHECK,MSSA +; RUN: opt -licm -basic-aa -ipt-expensive-asserts=true < %s -S | FileCheck %s --check-prefixes=CHECK,MSSA ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop(licm)' -ipt-expensive-asserts=true < %s -S | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop-mssa(licm)' -ipt-expensive-asserts=true < %s -S | FileCheck %s --check-prefixes=CHECK,MSSA diff --git a/llvm/test/Transforms/LICM/hoist-bitcast-load.ll b/llvm/test/Transforms/LICM/hoist-bitcast-load.ll index d96b75bfbba75..95c5b365cecd5 100644 --- a/llvm/test/Transforms/LICM/hoist-bitcast-load.ll +++ b/llvm/test/Transforms/LICM/hoist-bitcast-load.ll @@ -1,6 +1,5 @@ -; RUN: opt -S -basic-aa -licm < %s | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop(loop-simplifycfg,licm)' -S < %s | FileCheck %s -; RUN: opt -S -basic-aa -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -basic-aa -licm -verify-memoryssa < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LICM/hoist-debuginvariant.ll b/llvm/test/Transforms/LICM/hoist-debuginvariant.ll index 6f851fb10b887..0bfaf232a3e9b 100644 --- a/llvm/test/Transforms/LICM/hoist-debuginvariant.ll +++ b/llvm/test/Transforms/LICM/hoist-debuginvariant.ll @@ -1,6 +1,5 @@ -; RUN: opt < %s -licm -S | FileCheck %s ; RUN: opt < %s 
-strip-debug -licm -S | FileCheck %s -; RUN: opt < %s -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -licm -verify-memoryssa -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LICM/hoist-deref-load.ll b/llvm/test/Transforms/LICM/hoist-deref-load.ll index b13be8c9fb9a5..eacc920ee098f 100644 --- a/llvm/test/Transforms/LICM/hoist-deref-load.ll +++ b/llvm/test/Transforms/LICM/hoist-deref-load.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -basic-aa -licm < %s | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop(loop-simplifycfg,licm)' -S < %s | FileCheck %s -; RUN: opt -S -basic-aa -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -basic-aa -licm -verify-memoryssa < %s | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop-mssa(loop-simplifycfg,licm)' -verify-memoryssa -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LICM/hoist-fast-fdiv.ll b/llvm/test/Transforms/LICM/hoist-fast-fdiv.ll index bdefcf9e8f462..c48cac4d954d4 100644 --- a/llvm/test/Transforms/LICM/hoist-fast-fdiv.ll +++ b/llvm/test/Transforms/LICM/hoist-fast-fdiv.ll @@ -1,5 +1,4 @@ -; RUN: opt -licm -S < %s | FileCheck %s -; RUN: opt -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -licm -verify-memoryssa -S < %s | FileCheck %s ; Function Attrs: noinline norecurse nounwind readnone ssp uwtable define zeroext i1 @invariant_denom(double %v) #0 { diff --git a/llvm/test/Transforms/LICM/hoist-invariant-load.ll b/llvm/test/Transforms/LICM/hoist-invariant-load.ll index f711d8017e98a..6562441d77c88 100644 --- a/llvm/test/Transforms/LICM/hoist-invariant-load.ll +++ b/llvm/test/Transforms/LICM/hoist-invariant-load.ll @@ -1,6 
+1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -licm -disable-basic-aa -stats -S 2>&1 | grep "1 licm" -; RUN: opt < %s -licm -enable-mssa-loop-dependency=true -verify-memoryssa -disable-basic-aa -stats -S 2>&1 | grep "1 licm" +; RUN: opt < %s -licm -verify-memoryssa -disable-basic-aa -stats -S 2>&1 | grep "1 licm" @"\01L_OBJC_METH_VAR_NAME_" = internal global [4 x i8] c"foo\00", section "__TEXT,__objc_methname,cstring_literals", align 1 @"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip" diff --git a/llvm/test/Transforms/LICM/hoist-nounwind.ll b/llvm/test/Transforms/LICM/hoist-nounwind.ll index e74bf59bc36d1..a5f54c25aa57f 100644 --- a/llvm/test/Transforms/LICM/hoist-nounwind.ll +++ b/llvm/test/Transforms/LICM/hoist-nounwind.ll @@ -1,6 +1,5 @@ -; RUN: opt -S -basic-aa -licm < %s | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop(licm)' -S %s | FileCheck %s -; RUN: opt -S -basic-aa -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -basic-aa -licm -verify-memoryssa < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -95,4 +94,4 @@ for.body: for.cond.cleanup: ret i32 %add -} \ No newline at end of file +} diff --git a/llvm/test/Transforms/LICM/hoist-round.ll b/llvm/test/Transforms/LICM/hoist-round.ll index f76919eac8182..2c2c92eb28868 100644 --- a/llvm/test/Transforms/LICM/hoist-round.ll +++ b/llvm/test/Transforms/LICM/hoist-round.ll @@ -1,6 +1,5 @@ -; RUN: opt -S -licm < %s | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop(licm)' -S %s | FileCheck %s -; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -licm -verify-memoryssa < %s | FileCheck %s target datalayout = 
"E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32" diff --git a/llvm/test/Transforms/LICM/hoisting.ll b/llvm/test/Transforms/LICM/hoisting.ll index 00ac0f5756dea..580137ce2bf60 100644 --- a/llvm/test/Transforms/LICM/hoisting.ll +++ b/llvm/test/Transforms/LICM/hoisting.ll @@ -1,6 +1,5 @@ -; RUN: opt < %s -licm -S | FileCheck %s ; RUN: opt < %s -aa-pipeline=basic-aa -passes='require,loop(licm)' -S | FileCheck %s -; RUN: opt < %s -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -licm -verify-memoryssa -S | FileCheck %s @X = global i32 0 ; [#uses=1] diff --git a/llvm/test/Transforms/LICM/pr40317.ll b/llvm/test/Transforms/LICM/pr40317.ll index 487cbb23eb0b1..82c791f3bde68 100644 --- a/llvm/test/Transforms/LICM/pr40317.ll +++ b/llvm/test/Transforms/LICM/pr40317.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mcpu=z13 -tbaa -licm -enable-mssa-loop-dependency -licm-control-flow-hoisting -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -mcpu=z13 -tbaa -licm -licm-control-flow-hoisting -verify-memoryssa < %s | FileCheck %s target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" target triple = "s390x-ibm-linux" diff --git a/llvm/test/Transforms/LICM/pr42969.ll b/llvm/test/Transforms/LICM/pr42969.ll index 3cb82dc8095a6..428e64b9a6408 100644 --- a/llvm/test/Transforms/LICM/pr42969.ll +++ b/llvm/test/Transforms/LICM/pr42969.ll @@ -1,4 +1,4 @@ -; RUN: opt %s -S -scoped-noalias-aa -enable-mssa-loop-dependency=true -licm | FileCheck %s +; RUN: opt %s -S -scoped-noalias-aa -licm | FileCheck %s define i16 @main(i1 %a_b_mayalias, i16* %a, i16* %b) { ; CHECK: scalar.body: diff --git a/llvm/test/Transforms/LICM/promote-order.ll b/llvm/test/Transforms/LICM/promote-order.ll index 3e59ad1e2d0e5..197e94662ce81 100644 --- a/llvm/test/Transforms/LICM/promote-order.ll +++ b/llvm/test/Transforms/LICM/promote-order.ll @@ -1,5 +1,5 @@ ; RUN: opt -tbaa -basic-aa -licm -enable-mssa-loop-dependency=false 
-enable-new-pm=0 -S < %s | FileCheck %s --check-prefixes=CHECK,AST -; RUN: opt -tbaa -basic-aa -licm -enable-mssa-loop-dependency=true -enable-new-pm=0 -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA +; RUN: opt -tbaa -basic-aa -licm -enable-new-pm=0 -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='require,require,require,require,loop(licm)' -S %s | FileCheck %s --check-prefixes=CHECK,AST ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='require,require,require,require,loop-mssa(licm)' -S %s | FileCheck %s --check-prefixes=CHECK,MSSA diff --git a/llvm/test/Transforms/LICM/read-only-calls.ll b/llvm/test/Transforms/LICM/read-only-calls.ll index 0177c8e43204d..3fa242abab291 100644 --- a/llvm/test/Transforms/LICM/read-only-calls.ll +++ b/llvm/test/Transforms/LICM/read-only-calls.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=false %s -enable-new-pm=0 | FileCheck %s -; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=true %s -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 +; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 %s -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 ; RUN: opt -licm -basic-aa -licm-n2-threshold=200 < %s -S -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 ; RUN: opt -aa-pipeline=basic-aa -licm-n2-threshold=0 -passes='require,require,require,require,loop(licm)' < %s -S | FileCheck %s diff --git a/llvm/test/Transforms/LICM/sink.ll b/llvm/test/Transforms/LICM/sink.ll index 8a5da47847c86..a193098596074 100644 --- a/llvm/test/Transforms/LICM/sink.ll +++ b/llvm/test/Transforms/LICM/sink.ll @@ -1,10 +1,9 @@ ; RUN: opt -S -licm -licm-coldness-threshold=0 < %s | FileCheck %s --check-prefix=CHECK-LICM -; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK ; RUN: opt -S < %s 
-passes='require,loop(licm),loop-sink' \ ; RUN: | FileCheck %s --check-prefix=CHECK-SINK -; RUN: opt -S -licm -licm-coldness-threshold=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM -; RUN: opt -S -licm -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM +; RUN: opt -S -licm -licm-coldness-threshold=0 -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-LICM +; RUN: opt -S -licm -verify-memoryssa < %s | FileCheck %s --check-prefix=CHECK-BFI-LICM ; Original source code: ; int g; diff --git a/llvm/test/Transforms/LICM/sinking.ll b/llvm/test/Transforms/LICM/sinking.ll index e8660695aa79f..63d97afe150ba 100644 --- a/llvm/test/Transforms/LICM/sinking.ll +++ b/llvm/test/Transforms/LICM/sinking.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basic-aa -licm -S | FileCheck %s -; RUN: opt < %s -basic-aa -licm -S -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s +; RUN: opt < %s -basic-aa -licm -S -verify-memoryssa | FileCheck %s declare i32 @strlen(i8*) readonly nounwind willreturn diff --git a/llvm/test/Transforms/LICM/store-hoisting.ll b/llvm/test/Transforms/LICM/store-hoisting.ll index f8e9616d4c82e..7795c441b7e17 100644 --- a/llvm/test/Transforms/LICM/store-hoisting.ll +++ b/llvm/test/Transforms/LICM/store-hoisting.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -basic-aa -licm -enable-mssa-loop-dependency=false %s -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,AST %s -; RUN: opt -S -basic-aa -licm -enable-mssa-loop-dependency=true %s -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,MSSA %s +; RUN: opt -S -basic-aa -licm %s -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,MSSA %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop(licm)' < %s -S | FileCheck -check-prefixes=CHECK,AST %s ; RUN: opt -aa-pipeline=basic-aa 
-passes='require,require,require,require,loop-mssa(licm)' < %s -S | FileCheck -check-prefixes=CHECK,MSSA %s diff --git a/llvm/test/Transforms/LICM/volatile-alias.ll b/llvm/test/Transforms/LICM/volatile-alias.ll index be310e398c000..7ff2e51f22688 100644 --- a/llvm/test/Transforms/LICM/volatile-alias.ll +++ b/llvm/test/Transforms/LICM/volatile-alias.ll @@ -1,6 +1,5 @@ -; RUN: opt -basic-aa -sroa -loop-rotate -licm -S < %s | FileCheck %s ; RUN: opt -basic-aa -sroa -loop-rotate %s | opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop(licm)' -S | FileCheck %s -; RUN: opt -basic-aa -sroa -loop-rotate -licm -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -basic-aa -sroa -loop-rotate -licm -verify-memoryssa -S < %s | FileCheck %s ; The objects *p and *q are aliased to each other, but even though *q is ; volatile, *p can be considered invariant in the loop. Check if it is moved ; out of the loop. diff --git a/llvm/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll b/llvm/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll index a09a2290e0a54..b7be94bf9669e 100644 --- a/llvm/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll +++ b/llvm/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -verify-memoryssa -disable-output ; PR3408 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopRotate/PhiRename-1.ll b/llvm/test/Transforms/LoopRotate/PhiRename-1.ll index 8bece445cf463..147c08c49c99d 100644 --- 
a/llvm/test/Transforms/LoopRotate/PhiRename-1.ll +++ b/llvm/test/Transforms/LoopRotate/PhiRename-1.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -S | FileCheck %s -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -verify-memoryssa -S | FileCheck %s ; CHECK-NOT: [ {{.}}tmp224 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64" diff --git a/llvm/test/Transforms/LoopRotate/PhiSelfReference-1.ll b/llvm/test/Transforms/LoopRotate/PhiSelfReference-1.ll index 7726c53e55eee..c97d794811526 100644 --- a/llvm/test/Transforms/LoopRotate/PhiSelfReference-1.ll +++ b/llvm/test/Transforms/LoopRotate/PhiSelfReference-1.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -verify-memoryssa -disable-output ; ModuleID = 'PhiSelfReference-1.bc' define void @snrm2(i32 %incx) { diff --git a/llvm/test/Transforms/LoopRotate/alloca.ll b/llvm/test/Transforms/LoopRotate/alloca.ll index 59da33f8802ad..5014258010adc 100644 --- a/llvm/test/Transforms/LoopRotate/alloca.ll +++ b/llvm/test/Transforms/LoopRotate/alloca.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -loop-rotate -S | FileCheck %s -; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -loop-rotate -verify-memoryssa -S | FileCheck %s ; Test alloca in -loop-rotate. 
diff --git a/llvm/test/Transforms/LoopRotate/basic.ll b/llvm/test/Transforms/LoopRotate/basic.ll index 440cc210643af..5f48d66e6b32d 100644 --- a/llvm/test/Transforms/LoopRotate/basic.ll +++ b/llvm/test/Transforms/LoopRotate/basic.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s ; RUN: opt -S -passes='require,require,loop(loop-rotate)' < %s | FileCheck %s ; RUN: opt -S -passes='require,require,loop-mssa(loop-rotate)' -verify-memoryssa < %s | FileCheck %s diff --git a/llvm/test/Transforms/LoopRotate/callbr.ll b/llvm/test/Transforms/LoopRotate/callbr.ll index 6eed2eb17dc3f..3883bc4af5b7b 100644 --- a/llvm/test/Transforms/LoopRotate/callbr.ll +++ b/llvm/test/Transforms/LoopRotate/callbr.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info | FileCheck %s -; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true | FileCheck %s @d = external global i64, align 8 @f = external global i32, align 4 diff --git a/llvm/test/Transforms/LoopRotate/catchret.ll b/llvm/test/Transforms/LoopRotate/catchret.ll index f28af8aed601c..91c1554ab8ce4 100755 --- a/llvm/test/Transforms/LoopRotate/catchret.ll +++ b/llvm/test/Transforms/LoopRotate/catchret.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -S | FileCheck %s -; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -loop-rotate -verify-memoryssa -S | FileCheck %s target triple = "x86_64-pc-windows-msvc" diff --git a/llvm/test/Transforms/LoopRotate/convergent.ll b/llvm/test/Transforms/LoopRotate/convergent.ll index 37671562142eb..98733246270ae 100644 --- a/llvm/test/Transforms/LoopRotate/convergent.ll +++ 
b/llvm/test/Transforms/LoopRotate/convergent.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s @e = global i32 10 diff --git a/llvm/test/Transforms/LoopRotate/crash.ll b/llvm/test/Transforms/LoopRotate/crash.ll index 2a45e370e18c9..0a1688666ac11 100644 --- a/llvm/test/Transforms/LoopRotate/crash.ll +++ b/llvm/test/Transforms/LoopRotate/crash.ll @@ -1,5 +1,4 @@ -; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info < %s -; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa < %s +; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info -verify-memoryssa < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0.0" diff --git a/llvm/test/Transforms/LoopRotate/dbg-value-duplicates.ll b/llvm/test/Transforms/LoopRotate/dbg-value-duplicates.ll index ce7157c571f08..d55c024b9f56c 100644 --- a/llvm/test/Transforms/LoopRotate/dbg-value-duplicates.ll +++ b/llvm/test/Transforms/LoopRotate/dbg-value-duplicates.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s source_filename = "/tmp/loop.c" target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.13.0" diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll index 93e3c4c252cae..4e872d255d7f8 100644 --- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -1,5 +1,4 @@ -; RUN: opt 
-S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone diff --git a/llvm/test/Transforms/LoopRotate/indirectbr.ll b/llvm/test/Transforms/LoopRotate/indirectbr.ll index a26ec375953d2..3abf0ba00de81 100644 --- a/llvm/test/Transforms/LoopRotate/indirectbr.ll +++ b/llvm/test/Transforms/LoopRotate/indirectbr.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info | FileCheck %s -; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s +; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info -verify-memoryssa | FileCheck %s ; PR5502 define void @z80_do_opcodes() nounwind { diff --git a/llvm/test/Transforms/LoopRotate/loopexitinglatch.ll b/llvm/test/Transforms/LoopRotate/loopexitinglatch.ll index dee29ec958e19..a8f7c0d878cb6 100644 --- a/llvm/test/Transforms/LoopRotate/loopexitinglatch.ll +++ b/llvm/test/Transforms/LoopRotate/loopexitinglatch.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s -; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s +; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -verify-memoryssa | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8m.base-arm-none-eabi" diff --git a/llvm/test/Transforms/LoopRotate/multiple-exits.ll b/llvm/test/Transforms/LoopRotate/multiple-exits.ll index c6f153b8ca3ec..1a1ab3f32af36 100644 --- 
a/llvm/test/Transforms/LoopRotate/multiple-exits.ll +++ b/llvm/test/Transforms/LoopRotate/multiple-exits.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s -; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s +; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -verify-memoryssa | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/llvm/test/Transforms/LoopRotate/noalias.ll b/llvm/test/Transforms/LoopRotate/noalias.ll index 9f2dbb85bf969..09b5c5fd306d2 100644 --- a/llvm/test/Transforms/LoopRotate/noalias.ll +++ b/llvm/test/Transforms/LoopRotate/noalias.ll @@ -1,7 +1,6 @@ -; RUN: opt -S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s ; RUN: opt -S -passes='require,require,loop(loop-rotate)' < %s | FileCheck %s -; RUN: opt -S -passes='require,require,loop(loop-rotate)' -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -passes='require,require,loop(loop-rotate)' -verify-memoryssa < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopRotate/phi-dbgvalue.ll b/llvm/test/Transforms/LoopRotate/phi-dbgvalue.ll index c4e13dc710e7d..fb54cb8042e13 100644 --- a/llvm/test/Transforms/LoopRotate/phi-dbgvalue.ll +++ b/llvm/test/Transforms/LoopRotate/phi-dbgvalue.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate 
-enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s ;CHECK-LABEL: func ;CHECK-LABEL: entry diff --git a/llvm/test/Transforms/LoopRotate/phi-duplicate.ll b/llvm/test/Transforms/LoopRotate/phi-duplicate.ll index d7f69d8c9cc3b..292c595aefc1e 100644 --- a/llvm/test/Transforms/LoopRotate/phi-duplicate.ll +++ b/llvm/test/Transforms/LoopRotate/phi-duplicate.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -loop-rotate < %s | FileCheck %s -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0" diff --git a/llvm/test/Transforms/LoopRotate/pr22337.ll b/llvm/test/Transforms/LoopRotate/pr22337.ll index 8195affbcd3b2..bd8659b39fd2f 100644 --- a/llvm/test/Transforms/LoopRotate/pr22337.ll +++ b/llvm/test/Transforms/LoopRotate/pr22337.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -S | FileCheck %s -; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -loop-rotate -verify-memoryssa -S | FileCheck %s @a = external global i8, align 4 @tmp = global i8* @a diff --git a/llvm/test/Transforms/LoopRotate/pr33701.ll b/llvm/test/Transforms/LoopRotate/pr33701.ll index 8535e31767619..91ee2e84ec175 100644 --- a/llvm/test/Transforms/LoopRotate/pr33701.ll +++ b/llvm/test/Transforms/LoopRotate/pr33701.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output -; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -verify-memoryssa -disable-output define void @func() { bb0: 
diff --git a/llvm/test/Transforms/LoopRotate/pr37205.ll b/llvm/test/Transforms/LoopRotate/pr37205.ll index 20ad756818984..161016a2a08c3 100644 --- a/llvm/test/Transforms/LoopRotate/pr37205.ll +++ b/llvm/test/Transforms/LoopRotate/pr37205.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom < %s | FileCheck %s -; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom -verify-memoryssa < %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" ; Verify that we invalidate SCEV properly. diff --git a/llvm/test/Transforms/LoopRotate/preserve-mssa.ll b/llvm/test/Transforms/LoopRotate/preserve-mssa.ll index d975f80cd9e47..542bb7cb00ca3 100644 --- a/llvm/test/Transforms/LoopRotate/preserve-mssa.ll +++ b/llvm/test/Transforms/LoopRotate/preserve-mssa.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-rotate -verify-memoryssa < %s | FileCheck %s ; CHECK-LABEL: @multiedge( define void @multiedge() { diff --git a/llvm/test/Transforms/LoopRotate/preserve-scev.ll b/llvm/test/Transforms/LoopRotate/preserve-scev.ll index 2faf8ec487aaa..c35572ab8ccd1 100644 --- a/llvm/test/Transforms/LoopRotate/preserve-scev.ll +++ b/llvm/test/Transforms/LoopRotate/preserve-scev.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -loop-reduce -verify-dom-info -verify-loop-info -disable-output -; RUN: opt < %s -loop-rotate -loop-reduce -enable-mssa-loop-dependency=true -verify-memoryssa -verify-dom-info -verify-loop-info -disable-output +; RUN: opt < %s -loop-rotate -loop-reduce -verify-memoryssa -verify-dom-info -verify-loop-info -disable-output define fastcc void @foo(i32* %A, i64 %i) nounwind { BB: diff --git a/llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll 
b/llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll index c4c987e7b2baf..767d9093ad415 100644 --- a/llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll +++ b/llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-rotate -S | FileCheck %s -; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -loop-rotate -verify-memoryssa -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll index 24e443a5ca869..3ab19b22c9c8e 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll @@ -2,7 +2,7 @@ ; REQUIRES: asserts ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" diff --git a/llvm/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll b/llvm/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll index 201c5f10b5196..1a2368796a3b4 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll +++ 
b/llvm/test/Transforms/LoopSimplifyCFG/irreducible_cfg.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(loop-simplifycfg)' -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" diff --git a/llvm/test/Transforms/LoopSimplifyCFG/lcssa.ll b/llvm/test/Transforms/LoopSimplifyCFG/lcssa.ll index 84bea41af1dbf..326f474ac05b9 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/lcssa.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/lcssa.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true 
-loop-simplifycfg -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/LoopSimplifyCFG/live_block_marking.ll b/llvm/test/Transforms/LoopSimplifyCFG/live_block_marking.ll index 0f290282cb2d4..96364cf5fe34a 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/live_block_marking.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/live_block_marking.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -indvars -loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(indvars,loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -indvars -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -indvars -loop-simplifycfg -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s define void @test(i1 %c) { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/LoopSimplifyCFG/merge-header.ll b/llvm/test/Transforms/LoopSimplifyCFG/merge-header.ll index 8db5de7f851de..dec27d91e2623 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/merge-header.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/merge-header.ll @@ -1,6 +1,5 @@ -; RUN: opt -S -loop-simplifycfg < %s | FileCheck %s ; RUN: opt -S -passes='require,loop(loop-simplifycfg)' < %s | FileCheck %s -; RUN: opt -S -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-simplifycfg -verify-memoryssa < %s | FileCheck %s ; 
CHECK-LABEL: foo ; CHECK: entry: diff --git a/llvm/test/Transforms/LoopSimplifyCFG/mssa_update.ll b/llvm/test/Transforms/LoopSimplifyCFG/mssa_update.ll index 5a3ae453610dc..ab5442c066e07 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/mssa_update.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/mssa_update.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll b/llvm/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll index ffd298032222b..eeadd60d04868 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/phi_with_duplicating_inputs.ll @@ -2,9 +2,8 @@ ; This is currently failing because of bug in LoopSimplifyCFG. It does not update ; duplicating Phi inputs properly. 
; REQUIRES: asserts -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(loop-simplifycfg)' -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-memoryssa -debug-only=loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s target datalayout = "P40" diff --git a/llvm/test/Transforms/LoopSimplifyCFG/pr39783.ll b/llvm/test/Transforms/LoopSimplifyCFG/pr39783.ll index 570d57cde2dc9..f06fab7fe0dc2 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/pr39783.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/pr39783.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -mcpu=z13 -S -loop-simplifycfg -enable-mssa-loop-dependency -enable-loop-simplifycfg-term-folding -verify-memoryssa 2>&1 < %s | FileCheck %s +; RUN: opt -mcpu=z13 -S -loop-simplifycfg -enable-loop-simplifycfg-term-folding -verify-memoryssa 2>&1 < %s | FileCheck %s target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" @global = external dso_local local_unnamed_addr global i8, align 2 diff --git a/llvm/test/Transforms/LoopSimplifyCFG/scev.ll b/llvm/test/Transforms/LoopSimplifyCFG/scev.ll index 123c7e6d4a869..d66e5261b769c 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/scev.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/scev.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -loop-simplifycfg -verify-scev < %s | FileCheck 
%s -; RUN: opt -S -loop-simplifycfg -verify-scev -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-simplifycfg -verify-scev -verify-memoryssa < %s | FileCheck %s ; Verify that the scev information is still valid. Verification should not fail diff --git a/llvm/test/Transforms/LoopSimplifyCFG/update_parents.ll b/llvm/test/Transforms/LoopSimplifyCFG/update_parents.ll index f1104a0f96f12..c120cae54cb72 100644 --- a/llvm/test/Transforms/LoopSimplifyCFG/update_parents.ll +++ b/llvm/test/Transforms/LoopSimplifyCFG/update_parents.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s ; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s -; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s +; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -loop-simplifycfg -verify-memoryssa -verify-loop-info -verify-dom-info -verify-loop-lcssa 2>&1 < %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnswitch/2006-06-13-SingleEntryPHI.ll b/llvm/test/Transforms/LoopUnswitch/2006-06-13-SingleEntryPHI.ll index cbe5d6b694480..189f4799f46f5 100644 --- a/llvm/test/Transforms/LoopUnswitch/2006-06-13-SingleEntryPHI.ll +++ b/llvm/test/Transforms/LoopUnswitch/2006-06-13-SingleEntryPHI.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch 
-enable-new-pm=0 -verify-memoryssa -disable-output %struct.BLEND_MAP = type { i16, i16, i16, i32, %struct.BLEND_MAP_ENTRY* } %struct.BLEND_MAP_ENTRY = type { float, i8, { [5 x float], [4 x i8] } } diff --git a/llvm/test/Transforms/LoopUnswitch/2006-06-27-DeadSwitchCase.ll b/llvm/test/Transforms/LoopUnswitch/2006-06-27-DeadSwitchCase.ll index 3f17b36b1a568..bbd1bac1fc418 100644 --- a/llvm/test/Transforms/LoopUnswitch/2006-06-27-DeadSwitchCase.ll +++ b/llvm/test/Transforms/LoopUnswitch/2006-06-27-DeadSwitchCase.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output define void @init_caller_save() { entry: diff --git a/llvm/test/Transforms/LoopUnswitch/2007-05-09-Unreachable.ll b/llvm/test/Transforms/LoopUnswitch/2007-05-09-Unreachable.ll index 8f8cbc8dc6f10..660975cb4be44 100644 --- a/llvm/test/Transforms/LoopUnswitch/2007-05-09-Unreachable.ll +++ b/llvm/test/Transforms/LoopUnswitch/2007-05-09-Unreachable.ll @@ -1,6 +1,6 @@ ; PR1333 ; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64" target triple = "i686-pc-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnswitch/2007-05-09-tl.ll b/llvm/test/Transforms/LoopUnswitch/2007-05-09-tl.ll index bb9f07fbe0eef..237c92400eaf2 100644 --- a/llvm/test/Transforms/LoopUnswitch/2007-05-09-tl.ll +++ b/llvm/test/Transforms/LoopUnswitch/2007-05-09-tl.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch 
-enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output ; PR1333 define void @pp_cxx_expression() { diff --git a/llvm/test/Transforms/LoopUnswitch/2007-07-12-ExitDomInfo.ll b/llvm/test/Transforms/LoopUnswitch/2007-07-12-ExitDomInfo.ll index 673ac037ae3ed..d964beb589710 100644 --- a/llvm/test/Transforms/LoopUnswitch/2007-07-12-ExitDomInfo.ll +++ b/llvm/test/Transforms/LoopUnswitch/2007-07-12-ExitDomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -instcombine -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -instcombine -disable-output @str3 = external constant [3 x i8] ; <[3 x i8]*> [#uses=1] diff --git a/llvm/test/Transforms/LoopUnswitch/2007-07-13-DomInfo.ll b/llvm/test/Transforms/LoopUnswitch/2007-07-13-DomInfo.ll index 0ccb353a1573c..c282ed3c55a87 100644 --- a/llvm/test/Transforms/LoopUnswitch/2007-07-13-DomInfo.ll +++ b/llvm/test/Transforms/LoopUnswitch/2007-07-13-DomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output define i32 @main(i32 %argc, i8** %argv) { entry: diff --git a/llvm/test/Transforms/LoopUnswitch/2007-07-18-DomInfo.ll b/llvm/test/Transforms/LoopUnswitch/2007-07-18-DomInfo.ll index 80a3dc6a0a056..ab986d257aa59 100644 --- a/llvm/test/Transforms/LoopUnswitch/2007-07-18-DomInfo.ll +++ b/llvm/test/Transforms/LoopUnswitch/2007-07-18-DomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa 
-disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output ; PR1559 target triple = "i686-pc-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnswitch/2007-08-01-LCSSA.ll b/llvm/test/Transforms/LoopUnswitch/2007-08-01-LCSSA.ll index 64508a63715f0..53d1f63df4a28 100644 --- a/llvm/test/Transforms/LoopUnswitch/2007-08-01-LCSSA.ll +++ b/llvm/test/Transforms/LoopUnswitch/2007-08-01-LCSSA.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -instcombine -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -instcombine -disable-output %struct.ClassDef = type { %struct.QByteArray, %struct.QByteArray, %"struct.QList", %"struct.QList", i8, i8, %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QMap", %"struct.QList", %"struct.QMap", i32, i32 } %struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] } %struct.Generator = type { %struct.FILE*, %struct.ClassDef*, %"struct.QList", %struct.QByteArray, %"struct.QList" } diff --git a/llvm/test/Transforms/LoopUnswitch/2008-06-02-DomInfo.ll b/llvm/test/Transforms/LoopUnswitch/2008-06-02-DomInfo.ll index 4c8230d29306c..ff6207de80a91 100644 --- a/llvm/test/Transforms/LoopUnswitch/2008-06-02-DomInfo.ll +++ b/llvm/test/Transforms/LoopUnswitch/2008-06-02-DomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -instcombine -gvn -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -instcombine -gvn -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -instcombine -gvn -verify-memoryssa -disable-output ; PR2372 target triple = 
"i386-pc-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll b/llvm/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll index 8491bdb2acf12..a464bf4c5bacb 100644 --- a/llvm/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll +++ b/llvm/test/Transforms/LoopUnswitch/2008-11-03-Invariant.ll @@ -1,6 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -stats -disable-output 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -stats -disable-output 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -stats -disable-output 2>&1 | FileCheck %s ; PR 3170 define i32 @a(i32 %x, i32 %y) nounwind { diff --git a/llvm/test/Transforms/LoopUnswitch/2010-11-18-LCSSA.ll b/llvm/test/Transforms/LoopUnswitch/2010-11-18-LCSSA.ll index 8773b56b938ec..01adb6c3a3c70 100644 --- a/llvm/test/Transforms/LoopUnswitch/2010-11-18-LCSSA.ll +++ b/llvm/test/Transforms/LoopUnswitch/2010-11-18-LCSSA.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa ; PR8622 @g_38 = external global i32, align 4 diff --git a/llvm/test/Transforms/LoopUnswitch/2011-09-26-EHCrash.ll b/llvm/test/Transforms/LoopUnswitch/2011-09-26-EHCrash.ll index 8672c2d21c866..12f0c6f151662 100644 --- a/llvm/test/Transforms/LoopUnswitch/2011-09-26-EHCrash.ll +++ b/llvm/test/Transforms/LoopUnswitch/2011-09-26-EHCrash.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -sroa -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -sroa -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -sroa -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output ; PR11016 target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-macosx10.7.2" diff --git a/llvm/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll b/llvm/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll index e8d66e91bd1fb..6dddfea72e3f1 100644 --- a/llvm/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll +++ b/llvm/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts ; RUN: opt -loop-unswitch -enable-new-pm=0 -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info < %s | FileCheck %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info -verify-memoryssa < %s | FileCheck %s ; STATS: 2 loop-unswitch - Number of switches unswitched diff --git a/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll b/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll index ef4a611eba683..71ca7d1883a91 100644 --- a/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll +++ b/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts ; RUN: opt -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 13 -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 13 -verify-loop-info -verify-dom-info < %s | FileCheck %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 13 -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-unswitch -enable-new-pm=0 
-loop-unswitch-threshold 13 -verify-loop-info -verify-dom-info -verify-memoryssa < %s | FileCheck %s ; STATS: 1 loop-unswitch - Number of switches unswitched diff --git a/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll b/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll index 6cbff0ed9c210..0e070989bb70f 100644 --- a/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll +++ b/llvm/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts ; RUN: opt -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 1000 -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 1000 -verify-loop-info -verify-dom-info < %s | FileCheck %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 1000 -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold 1000 -verify-loop-info -verify-dom-info -verify-memoryssa < %s | FileCheck %s ; STATS: 3 loop-unswitch - Number of switches unswitched diff --git a/llvm/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll b/llvm/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll index 4f271a832e3db..660cc7115c808 100644 --- a/llvm/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll +++ b/llvm/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -S -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info | FileCheck %s -; RUN: opt < %s -S -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s +; RUN: opt < %s -S -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info -verify-memoryssa | FileCheck %s ; PR12343: -loop-unswitch -enable-new-pm=0 crash on indirect branch ; CHECK: %0 = icmp eq i64 undef, 0 diff --git 
a/llvm/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll b/llvm/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll index 13aadfac9f732..055856d0b2a9c 100644 --- a/llvm/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll +++ b/llvm/test/Transforms/LoopUnswitch/2012-05-20-Phi.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output ; PR12887 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnswitch/2015-09-18-Addrspace.ll b/llvm/test/Transforms/LoopUnswitch/2015-09-18-Addrspace.ll index f50a3e6bf25dc..48be24d3165c2 100644 --- a/llvm/test/Transforms/LoopUnswitch/2015-09-18-Addrspace.ll +++ b/llvm/test/Transforms/LoopUnswitch/2015-09-18-Addrspace.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -S | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -S | FileCheck %s ; In cases where two address spaces do not have the same size pointer, the ; input for the addrspacecast should not be used as a substitute for itself diff --git a/llvm/test/Transforms/LoopUnswitch/LIV-loop-condtion.ll b/llvm/test/Transforms/LoopUnswitch/LIV-loop-condtion.ll index c93e3964edb08..af09e65ca4f94 100644 --- a/llvm/test/Transforms/LoopUnswitch/LIV-loop-condtion.ll +++ b/llvm/test/Transforms/LoopUnswitch/LIV-loop-condtion.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=0 -S 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=0 
-enable-mssa-loop-dependency=true -verify-memoryssa -S 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=0 -verify-memoryssa -S 2>&1 | FileCheck %s ; This is to test trivial loop unswitch only happens when trivial condition ; itself is an LIV loop condition (not partial LIV which could occur in and/or). diff --git a/llvm/test/Transforms/LoopUnswitch/basictest.ll b/llvm/test/Transforms/LoopUnswitch/basictest.ll index 812fb75820afb..61647c8f64442 100644 --- a/llvm/test/Transforms/LoopUnswitch/basictest.ll +++ b/llvm/test/Transforms/LoopUnswitch/basictest.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -S < %s 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-memoryssa -S < %s 2>&1 | FileCheck %s define i32 @test(i32* %A, i1 %C) { entry: diff --git a/llvm/test/Transforms/LoopUnswitch/cleanuppad.ll b/llvm/test/Transforms/LoopUnswitch/cleanuppad.ll index fe50c058883cf..f0ffbb9caf9eb 100644 --- a/llvm/test/Transforms/LoopUnswitch/cleanuppad.ll +++ b/llvm/test/Transforms/LoopUnswitch/cleanuppad.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-unswitch -enable-new-pm=0 < %s | FileCheck %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-unswitch -enable-new-pm=0 -verify-memoryssa < %s | FileCheck %s target triple = "x86_64-pc-win32" define void @f(i32 %doit, i1 %x, i1 %y) personality i32 (...)* @__CxxFrameHandler3 { diff --git a/llvm/test/Transforms/LoopUnswitch/copy-metadata.ll b/llvm/test/Transforms/LoopUnswitch/copy-metadata.ll index 7b8b5b18a1a51..14fca405ecf17 100644 --- a/llvm/test/Transforms/LoopUnswitch/copy-metadata.ll +++ b/llvm/test/Transforms/LoopUnswitch/copy-metadata.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s 
-loop-unswitch -enable-new-pm=0 -S < %s 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -S < %s 2>&1 | FileCheck %s ; This test checks if unswitched condition preserve make.implicit metadata. diff --git a/llvm/test/Transforms/LoopUnswitch/crash.ll b/llvm/test/Transforms/LoopUnswitch/crash.ll index c36a83ea8ace3..43cecd7377d97 100644 --- a/llvm/test/Transforms/LoopUnswitch/crash.ll +++ b/llvm/test/Transforms/LoopUnswitch/crash.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output define void @test1(i32* %S2) { entry: diff --git a/llvm/test/Transforms/LoopUnswitch/elseif-non-exponential-behavior.ll b/llvm/test/Transforms/LoopUnswitch/elseif-non-exponential-behavior.ll index a609d5d11dab7..0f08e47365058 100644 --- a/llvm/test/Transforms/LoopUnswitch/elseif-non-exponential-behavior.ll +++ b/llvm/test/Transforms/LoopUnswitch/elseif-non-exponential-behavior.ll @@ -1,5 +1,4 @@ -; RUN: opt -loop-unswitch -enable-new-pm=0 -S - < %s | FileCheck %s -; RUN: opt -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -S - < %s | FileCheck %s +; RUN: opt -loop-unswitch -enable-new-pm=0 -verify-memoryssa -S - < %s | FileCheck %s ;CHECK-LABEL: @b ;CHECK: [[Loop1:for\.end.*]]: ; preds = %for.cond.us diff --git a/llvm/test/Transforms/LoopUnswitch/exponential-behavior.ll b/llvm/test/Transforms/LoopUnswitch/exponential-behavior.ll index 15bec6701f347..8a165ab942722 100644 --- a/llvm/test/Transforms/LoopUnswitch/exponential-behavior.ll +++ b/llvm/test/Transforms/LoopUnswitch/exponential-behavior.ll @@ -1,5 +1,5 @@ ; RUN: opt -loop-unswitch -enable-new-pm=0 -S < %s | 
FileCheck %s -; RUN: opt -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -loop-unswitch -enable-new-pm=0 -verify-memoryssa -S < %s | FileCheck %s define void @f(i32 %n, i32* %ptr) { ; CHECK-LABEL: @f( diff --git a/llvm/test/Transforms/LoopUnswitch/guards.ll b/llvm/test/Transforms/LoopUnswitch/guards.ll index 727d488dd9307..e4aa9679b65ca 100644 --- a/llvm/test/Transforms/LoopUnswitch/guards.ll +++ b/llvm/test/Transforms/LoopUnswitch/guards.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -loop-unswitch -enable-new-pm=0 < %s | FileCheck %s -; RUN: opt -S -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -loop-unswitch -enable-new-pm=0 -verify-memoryssa < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) diff --git a/llvm/test/Transforms/LoopUnswitch/infinite-loop.ll b/llvm/test/Transforms/LoopUnswitch/infinite-loop.ll index 6e9cfa9559ceb..82aab701fce44 100644 --- a/llvm/test/Transforms/LoopUnswitch/infinite-loop.ll +++ b/llvm/test/Transforms/LoopUnswitch/infinite-loop.ll @@ -1,6 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -loop-unswitch -enable-new-pm=0 -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s -; RUN: opt -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s +; RUN: opt -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s ; RUN: opt -loop-unswitch -enable-new-pm=0 -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s ; PR5373 diff --git a/llvm/test/Transforms/LoopUnswitch/invalidate-scev.ll b/llvm/test/Transforms/LoopUnswitch/invalidate-scev.ll index edd1e486f0a8c..b621640d7f263 100644 --- a/llvm/test/Transforms/LoopUnswitch/invalidate-scev.ll +++ 
b/llvm/test/Transforms/LoopUnswitch/invalidate-scev.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -indvars -loop-unswitch -enable-new-pm=0 < %s | FileCheck %s -; RUN: opt -S -indvars -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -indvars -loop-unswitch -enable-new-pm=0 -verify-memoryssa < %s | FileCheck %s target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnswitch/msan.ll b/llvm/test/Transforms/LoopUnswitch/msan.ll index 1dd6fb534f480..81fd64bfcf63d 100644 --- a/llvm/test/Transforms/LoopUnswitch/msan.ll +++ b/llvm/test/Transforms/LoopUnswitch/msan.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -S < %s 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-memoryssa -S < %s 2>&1 | FileCheck %s @sink = global i32 0, align 4 @y = global i64 0, align 8 diff --git a/llvm/test/Transforms/LoopUnswitch/pr32818.ll b/llvm/test/Transforms/LoopUnswitch/pr32818.ll index c8d0508ef580f..6f22a7aa04e85 100644 --- a/llvm/test/Transforms/LoopUnswitch/pr32818.ll +++ b/llvm/test/Transforms/LoopUnswitch/pr32818.ll @@ -1,7 +1,6 @@ ; Check that the call doesn't get removed even if ; it has no uses. It could have side-effects. 
-; RUN: opt -loop-unswitch -enable-new-pm=0 -S %s | FileCheck %s -; RUN: opt -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -S %s | FileCheck %s +; RUN: opt -loop-unswitch -enable-new-pm=0 -verify-memoryssa -S %s | FileCheck %s ; CHECK-LABEL: @tinky define i32 @tinkywinky(i8 %patatino) { diff --git a/llvm/test/Transforms/LoopUnswitch/preserve-analyses.ll b/llvm/test/Transforms/LoopUnswitch/preserve-analyses.ll index ae0f837d18b80..439dfda43ba27 100644 --- a/llvm/test/Transforms/LoopUnswitch/preserve-analyses.ll +++ b/llvm/test/Transforms/LoopUnswitch/preserve-analyses.ll @@ -1,5 +1,4 @@ -; RUN: opt -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-dom-info -disable-output < %s -; RUN: opt -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -verify-loop-info -verify-dom-info -disable-output < %s +; RUN: opt -loop-unswitch -enable-new-pm=0 -verify-memoryssa -verify-loop-info -verify-dom-info -disable-output < %s ; Loop unswitch should be able to unswitch these loops and ; preserve LCSSA and LoopSimplify forms. diff --git a/llvm/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll b/llvm/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll index c75752d0294b7..590e9b6da604c 100644 --- a/llvm/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll +++ b/llvm/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -S < %s 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-loop-info -verify-memoryssa -S < %s 2>&1 | FileCheck %s ; There are 1 case and 1 default case in the switch. 
after we unswitch, we know the ; %a is definitely not 0 in one of the unswitched loop, make sure we take advantage diff --git a/llvm/test/Transforms/LoopUnswitch/trivial-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/trivial-unswitch.ll index 0a76d081eb128..24d065eb7b834 100644 --- a/llvm/test/Transforms/LoopUnswitch/trivial-unswitch.ll +++ b/llvm/test/Transforms/LoopUnswitch/trivial-unswitch.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=0 -verify-loop-info -S < %s 2>&1 | FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=0 -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s 2>&1 | FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=0 -verify-loop-info -verify-memoryssa -S < %s 2>&1 | FileCheck %s ; This test contains two trivial unswitch condition in one loop. ; LoopUnswitch pass should be able to unswitch the second one diff --git a/llvm/test/Transforms/LoopUnswitch/unswitch-equality-undef.ll b/llvm/test/Transforms/LoopUnswitch/unswitch-equality-undef.ll index e81e07d165477..70e34aa791d13 100644 --- a/llvm/test/Transforms/LoopUnswitch/unswitch-equality-undef.ll +++ b/llvm/test/Transforms/LoopUnswitch/unswitch-equality-undef.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -instcombine -licm -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=1000 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output -stats 2>&1| FileCheck %s +; RUN: opt < %s -instcombine -licm -loop-unswitch -enable-new-pm=0 -loop-unswitch-threshold=1000 -verify-memoryssa -disable-output -stats 2>&1| FileCheck %s ; Check no loop unswitch is done because unswitching of equality expr with ; undef is unsafe before the freeze patch is committed. 
; CHECK-NOT: Number of branches unswitched diff --git a/llvm/test/Transforms/LoopUnswitch/unswitch-select.ll b/llvm/test/Transforms/LoopUnswitch/unswitch-select.ll index f67d0ffa01d55..9de9b32bdb3fe 100644 --- a/llvm/test/Transforms/LoopUnswitch/unswitch-select.ll +++ b/llvm/test/Transforms/LoopUnswitch/unswitch-select.ll @@ -1,6 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -disable-output -stats 2>&1| FileCheck %s -; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output -stats 2>&1| FileCheck %s +; RUN: opt < %s -loop-unswitch -enable-new-pm=0 -verify-memoryssa -disable-output -stats 2>&1| FileCheck %s ; Check the select statement in the loop will be unswitched. ; CHECK: 1 loop-unswitch - Number of selects unswitched diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll index 0a769ec5da6b4..2c6424c235c46 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-13-SingleEntryPHI.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output %struct.BLEND_MAP = type { i16, i16, i16, i32, %struct.BLEND_MAP_ENTRY* } %struct.BLEND_MAP_ENTRY = type { float, i8, { [5 x float], [4 x i8] } } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll index 85066168e1e23..61460da1e5930 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2006-06-27-DeadSwitchCase.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s 
-simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output define void @init_caller_save() { entry: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll index 02c7a96deb5dc..6e0362e355880 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-Unreachable.ll @@ -1,6 +1,5 @@ ; PR1333 -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64" target triple = "i686-pc-linux-gnu" diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll index a0408c8ea6a2e..13b78dde0ee32 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2007-05-09-tl.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output ; PR1333 define void @pp_cxx_expression() { diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll index 571e3eb6696df..65560369e3289 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-12-ExitDomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -instcombine -disable-output -; RUN: opt < %s 
-simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -instcombine -disable-output @str3 = external constant [3 x i8] ; <[3 x i8]*> [#uses=1] diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll index 626ac848cfb6b..644aaedde3a13 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-13-DomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output define i32 @main(i32 %argc, i8** %argv) { entry: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll index 52d96893060e1..e7c94cc35413f 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2007-07-18-DomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output ; PR1559 target triple = "i686-pc-linux-gnu" diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll index 7c65459a65ccc..c389b2e5e1002 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2007-08-01-LCSSA.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -instcombine -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -disable-output +; RUN: 
opt < %s -simple-loop-unswitch -verify-memoryssa -instcombine -disable-output %struct.ClassDef = type { %struct.QByteArray, %struct.QByteArray, %"struct.QList", %"struct.QList", i8, i8, %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QList", %"struct.QMap", %"struct.QList", %"struct.QMap", i32, i32 } %struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] } %struct.Generator = type { %struct.FILE*, %struct.ClassDef*, %"struct.QList", %struct.QByteArray, %"struct.QList" } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll index 5db1ced473f6a..ed61b1aca8241 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2008-06-02-DomInfo.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -instcombine -gvn -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -instcombine -gvn -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -instcombine -gvn -disable-output ; PR2372 target triple = "i386-pc-linux-gnu" diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll index f3a382d813b8b..ece07c65f5ccb 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2010-11-18-LCSSA.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa ; PR8622 @g_38 = external global i32, align 4 diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll 
b/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll index b861d3029d566..9ee6f4c9dc860 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2011-06-02-CritSwitch.ll @@ -1,5 +1,4 @@ -; RUN: opt -simple-loop-unswitch -disable-output < %s -; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output < %s +; RUN: opt -simple-loop-unswitch -verify-memoryssa -disable-output < %s ; PR10031 define i32 @test(i32 %command) { diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll index 16886bfec034d..7133b2618792d 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2011-09-26-EHCrash.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -sroa -simple-loop-unswitch -disable-output -; RUN: opt < %s -sroa -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -sroa -simple-loop-unswitch -verify-memoryssa -disable-output ; PR11016 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-macosx10.7.2" diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll index 72af984081877..61cc78f09ee98 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2012-04-02-IndirectBr.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -S -simple-loop-unswitch -verify-loop-info -verify-dom-info | FileCheck %s -; RUN: opt < %s -S -simple-loop-unswitch -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s +; RUN: opt < %s -S -simple-loop-unswitch -verify-loop-info 
-verify-dom-info -verify-memoryssa | FileCheck %s ; PR12343: -simple-loop-unswitch crash on indirect branch ; CHECK: %0 = icmp eq i64 undef, 0 diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll index e9f35709ba730..feed1f0c93558 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2012-05-20-Phi.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output ; PR12887 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll b/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll index c8de642a5d09e..74e918aaf0e74 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/2015-09-18-Addrspace.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -S | FileCheck %s -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -S | FileCheck %s ; In cases where two address spaces do not have the same size pointer, the ; input for the addrspacecast should not be used as a substitute for itself diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll b/llvm/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll index 59c14e937b637..32a3959a91be7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/LIV-loop-condtion.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s 
-simple-loop-unswitch -S 2>&1 | FileCheck %s -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S 2>&1 | FileCheck %s +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -S 2>&1 | FileCheck %s ; This is to test trivial loop unswitch only happens when trivial condition ; itself is an LIV loop condition (not partial LIV which could occur in and/or). diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll b/llvm/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll index 1cade22b65905..2c949aa83cc17 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/cleanuppad.ll @@ -1,5 +1,4 @@ -; RUN: opt -S -simple-loop-unswitch < %s | FileCheck %s -; RUN: opt -S -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s +; RUN: opt -S -simple-loop-unswitch -verify-memoryssa < %s | FileCheck %s target triple = "x86_64-pc-win32" define void @f(i32 %doit, i1 %x, i1 %y) personality i32 (...)* @__CxxFrameHandler3 { diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll b/llvm/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll index 09d7d792c7c6c..6ad72c6a1824c 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/copy-metadata.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -S | FileCheck %s -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -S | FileCheck %s ; This test checks if unswitched condition preserve make.implicit metadata. 
define i32 @test(i1 %cond) { diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/crash.ll b/llvm/test/Transforms/SimpleLoopUnswitch/crash.ll index cf6a19d254019..554c30ebdd871 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/crash.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/crash.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -simple-loop-unswitch -disable-output -; RUN: opt < %s -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output +; RUN: opt < %s -simple-loop-unswitch -verify-memoryssa -disable-output define void @test1(i32* %S2) { entry: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll index 1c46ddbf51a86..5d309ee703076 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-behavior.ll @@ -1,5 +1,4 @@ -; RUN: opt -simple-loop-unswitch -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -verify-memoryssa -S < %s | FileCheck %s define void @f(i32 %n, i32* %ptr) { ; CHECK-LABEL: @f( diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll b/llvm/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll index 91e1f486b8286..7169154849548 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/infinite-loop.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts ; RUN: opt -simple-loop-unswitch -disable-output -stats -info-output-file - < %s | FileCheck --check-prefix=STATS %s -; RUN: opt -simple-loop-unswitch -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -verify-memoryssa -S < %s | FileCheck %s ; PR5373 ; Loop unswitching shouldn't trivially unswitch the true case of condition %a 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll index 56a86a01e28e6..1185fec70e1fe 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-cost.ll @@ -2,8 +2,7 @@ ; ; RUN: opt -passes='loop(simple-loop-unswitch),verify' -unswitch-threshold=5 -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify' -unswitch-threshold=5 -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -unswitch-threshold=5 -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -unswitch-threshold=5 -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -unswitch-threshold=5 -verify-memoryssa -S < %s | FileCheck %s declare void @a() declare void @b() diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll index d0963b71f1ad9..8be18d5152399 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll @@ -1,7 +1,6 @@ ; RUN: opt -passes='loop(simple-loop-unswitch),verify' -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch),verify' -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -verify-memoryssa -S < %s | FileCheck %s declare i32 @a() declare i32 @b() diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll index e8e34a2e88240..2fc3d2bf6824d 
100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr37888.ll @@ -1,5 +1,4 @@ -; RUN: opt -simple-loop-unswitch -loop-deletion -S < %s | FileCheck %s -; RUN: opt -simple-loop-unswitch -enable-mssa-loop-dependency=true -verify-memoryssa -loop-deletion -S < %s | FileCheck %s +; RUN: opt -simple-loop-unswitch -verify-memoryssa -loop-deletion -S < %s | FileCheck %s ; ; Check that when we do unswitching where we re-enqueue the loop to be processed ; again, but manage to delete the loop before ever getting to iterate on it, it diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll b/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll index 114825348dadd..50dac9d4a433b 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/preserve-analyses.ll @@ -1,5 +1,4 @@ -; RUN: opt -simple-loop-unswitch -verify-loop-info -verify-dom-info -disable-output < %s -; RUN: opt -simple-loop-unswitch -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output < %s +; RUN: opt -simple-loop-unswitch -verify-loop-info -verify-dom-info -verify-memoryssa -disable-output < %s ; Loop unswitch should be able to unswitch these loops and ; preserve LCSSA and LoopSimplify forms. From 7776b19eed44906e9973bfb240b6279d6feaab41 Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Thu, 24 Sep 2020 11:54:46 -0700 Subject: [PATCH 040/700] [MLIR] Move TestDialect to ::test namespace While the changes are extensive, they basically fall into a few categories: 1) Moving the TestDialect itself. 2) Updating C++ code in tablegen to explicitly use ::mlir, since it will be put in a headers that shouldn't expect a 'using'. 3) Updating some generic MLIR Interface definitions to do the same thing. 
4) Updating the Tablegen generator in a few places to be explicit about namespaces 5) Doing the same thing for llvm references, since we no longer pick up the definitions from mlir/Support/LLVM.h Differential Revision: https://reviews.llvm.org/D88251 --- mlir/include/mlir/IR/BuiltinOps.td | 2 +- mlir/include/mlir/IR/OpBase.td | 2 +- mlir/include/mlir/IR/RegionKindInterface.td | 4 +- .../mlir/Interfaces/InferTypeOpInterface.td | 4 +- mlir/test/lib/Dialect/Test/TestAttributes.cpp | 2 +- mlir/test/lib/Dialect/Test/TestDialect.cpp | 10 +- mlir/test/lib/Dialect/Test/TestDialect.h | 8 +- mlir/test/lib/Dialect/Test/TestInterfaces.td | 14 +-- mlir/test/lib/Dialect/Test/TestOps.td | 64 +++++------ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 4 +- mlir/test/lib/Dialect/Test/TestTraits.cpp | 2 +- mlir/test/lib/Dialect/Test/TestTypeDefs.td | 16 +-- mlir/test/lib/Dialect/Test/TestTypes.cpp | 8 +- mlir/test/lib/Dialect/Test/TestTypes.h | 38 +++---- mlir/test/lib/IR/TestInterfaces.cpp | 2 +- mlir/test/lib/IR/TestTypes.cpp | 2 +- mlir/test/lib/Transforms/TestInlining.cpp | 2 +- mlir/test/mlir-tblgen/attrdefs.td | 26 ++--- mlir/test/mlir-tblgen/typedefs.td | 28 ++--- mlir/tools/mlir-opt/mlir-opt.cpp | 103 +++++++++--------- mlir/tools/mlir-reduce/mlir-reduce.cpp | 2 - mlir/unittests/IR/InterfaceAttachmentTest.cpp | 2 +- 22 files changed, 170 insertions(+), 175 deletions(-) diff --git a/mlir/include/mlir/IR/BuiltinOps.td b/mlir/include/mlir/IR/BuiltinOps.td index a05cfdd944714..8b6852b693936 100644 --- a/mlir/include/mlir/IR/BuiltinOps.td +++ b/mlir/include/mlir/IR/BuiltinOps.td @@ -115,7 +115,7 @@ def FuncOp : Builtin_Op<"func", [ /// Returns the region on the current operation that is callable. This may /// return null in the case of an external callable object, e.g. an external /// function. - Region *getCallableRegion() { return isExternal() ? nullptr : &getBody(); } + ::mlir::Region *getCallableRegion() { return isExternal() ? 
nullptr : &getBody(); } /// Returns the results types that the callable region produces when /// executed. diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td index c3fb49035f8a4..c24c05b877cf3 100644 --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -1618,7 +1618,7 @@ class DerivedAttr : } // Derived attribute that returns a mlir::Type. -class DerivedTypeAttr : DerivedAttr<"Type", body> { +class DerivedTypeAttr : DerivedAttr<"::mlir::Type", body> { let convertFromStorage = "::mlir::TypeAttr::get($_self)"; } diff --git a/mlir/include/mlir/IR/RegionKindInterface.td b/mlir/include/mlir/IR/RegionKindInterface.td index bef161c00def2..90c96ac21b43e 100644 --- a/mlir/include/mlir/IR/RegionKindInterface.td +++ b/mlir/include/mlir/IR/RegionKindInterface.td @@ -33,7 +33,7 @@ def RegionKindInterface : OpInterface<"RegionKindInterface"> { /*desc=*/[{ Return the kind of the region with the given index inside this operation. }], - /*retTy=*/"RegionKind", + /*retTy=*/"::mlir::RegionKind", /*methodName=*/"getRegionKind", /*args=*/(ins "unsigned":$index) >, @@ -44,7 +44,7 @@ def RegionKindInterface : OpInterface<"RegionKindInterface"> { /*methodName=*/"hasSSADominance", /*args=*/(ins "unsigned":$index), /*methodBody=*/[{ - return getRegionKind(index) == RegionKind::SSACFG; + return getRegionKind(index) == ::mlir::RegionKind::SSACFG; }] >, ]; diff --git a/mlir/include/mlir/Interfaces/InferTypeOpInterface.td b/mlir/include/mlir/Interfaces/InferTypeOpInterface.td index 0d2c2b520a178..fe7c8eeb2e134 100644 --- a/mlir/include/mlir/Interfaces/InferTypeOpInterface.td +++ b/mlir/include/mlir/Interfaces/InferTypeOpInterface.td @@ -168,10 +168,10 @@ def ReifyRankedShapedTypeOpInterface : rank of the corresponding result. If the shape of a particular result cannot be computed it must be empty. 
}], - /*retTy=*/"LogicalResult", + /*retTy=*/"::mlir::LogicalResult", /*methodName=*/"reifyResultShapes", /*args=*/(ins "::mlir::OpBuilder &":$builder, - "ReifiedRankedShapedTypeDims &":$reifiedReturnShapes) + "::mlir::ReifiedRankedShapedTypeDims &":$reifiedReturnShapes) > ]; } diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp index b6a2ca6145e27..94b9ea8429944 100644 --- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp +++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp @@ -21,7 +21,7 @@ #include "llvm/ADT/TypeSwitch.h" using namespace mlir; -using namespace mlir::test; +using namespace test; //===----------------------------------------------------------------------===// // AttrWithSelfTypeParamAttr diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index e56c2d1a92d0f..2a1f37119e2d5 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -22,12 +22,14 @@ #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/StringSwitch.h" -using namespace mlir; -using namespace mlir::test; - +// Include this before the using namespace lines below to +// test that we don't have namespace dependencies. 
#include "TestOpsDialect.cpp.inc" -void mlir::test::registerTestDialect(DialectRegistry ®istry) { +using namespace mlir; +using namespace test; + +void test::registerTestDialect(DialectRegistry ®istry) { registry.insert(); } diff --git a/mlir/test/lib/Dialect/Test/TestDialect.h b/mlir/test/lib/Dialect/Test/TestDialect.h index d57a2c119723a..5aca160c3f183 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.h +++ b/mlir/test/lib/Dialect/Test/TestDialect.h @@ -45,11 +45,9 @@ class RewritePatternSet; #define GET_OP_CLASSES #include "TestOps.h.inc" -namespace mlir { namespace test { -void registerTestDialect(DialectRegistry ®istry); -void populateTestReductionPatterns(RewritePatternSet &patterns); -} // namespace test -} // namespace mlir +void registerTestDialect(::mlir::DialectRegistry ®istry); +void populateTestReductionPatterns(::mlir::RewritePatternSet &patterns); +} // end namespace test #endif // MLIR_TESTDIALECT_H diff --git a/mlir/test/lib/Dialect/Test/TestInterfaces.td b/mlir/test/lib/Dialect/Test/TestInterfaces.td index 817f2f78bc914..1d9fd9c21e46d 100644 --- a/mlir/test/lib/Dialect/Test/TestInterfaces.td +++ b/mlir/test/lib/Dialect/Test/TestInterfaces.td @@ -14,26 +14,26 @@ include "mlir/Interfaces/SideEffectInterfaceBase.td" // A type interface used to test the ODS generation of type interfaces. 
def TestTypeInterface : TypeInterface<"TestTypeInterface"> { - let cppNamespace = "::mlir::test"; + let cppNamespace = "::test"; let methods = [ InterfaceMethod<"Prints the type name.", - "void", "printTypeA", (ins "Location":$loc), [{ + "void", "printTypeA", (ins "::mlir::Location":$loc), [{ emitRemark(loc) << $_type << " - TestA"; }] >, InterfaceMethod<"Prints the type name.", - "void", "printTypeB", (ins "Location":$loc), + "void", "printTypeB", (ins "::mlir::Location":$loc), [{}], /*defaultImplementation=*/[{ emitRemark(loc) << $_type << " - TestB"; }] >, InterfaceMethod<"Prints the type name.", - "void", "printTypeC", (ins "Location":$loc) + "void", "printTypeC", (ins "::mlir::Location":$loc) >, // It should be possible to use the interface type name as result type // as well as in the implementation. InterfaceMethod<"Prints the type name and returns the type as interface.", - "TestTypeInterface", "printTypeRet", (ins "Location":$loc), + "TestTypeInterface", "printTypeRet", (ins "::mlir::Location":$loc), [{}], /*defaultImplementation=*/[{ emitRemark(loc) << $_type << " - TestRet"; return $_type; @@ -42,13 +42,13 @@ def TestTypeInterface : TypeInterface<"TestTypeInterface"> { ]; let extraClassDeclaration = [{ /// Prints the type name. - void printTypeD(Location loc) const { + void printTypeD(::mlir::Location loc) const { emitRemark(loc) << *this << " - TestD"; } }]; let extraTraitClassDeclaration = [{ /// Prints the type name. 
- void printTypeE(Location loc) const { + void printTypeE(::mlir::Location loc) const { emitRemark(loc) << $_type << " - TestE"; } }]; diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index c8b656a75e0db..add66b421f1f2 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -24,7 +24,7 @@ include "TestInterfaces.td" def Test_Dialect : Dialect { let name = "test"; - let cppNamespace = "::mlir::test"; + let cppNamespace = "::test"; let hasCanonicalizer = 1; let hasConstantMaterializer = 1; let hasOperationAttrVerify = 1; @@ -38,16 +38,16 @@ def Test_Dialect : Dialect { void registerAttributes(); void registerTypes(); - Attribute parseAttribute(DialectAsmParser &parser, - Type type) const override; - void printAttribute(Attribute attr, - DialectAsmPrinter &printer) const override; + ::mlir::Attribute parseAttribute(::mlir::DialectAsmParser &parser, + ::mlir::Type type) const override; + void printAttribute(::mlir::Attribute attr, + ::mlir::DialectAsmPrinter &printer) const override; // Provides a custom printing/parsing for some operations. - Optional - getParseOperationHook(StringRef opName) const override; - LogicalResult printOperation(Operation *op, - OpAsmPrinter &printer) const override; + ::llvm::Optional + getParseOperationHook(::llvm::StringRef opName) const override; + ::mlir::LogicalResult printOperation(::mlir::Operation *op, + ::mlir::OpAsmPrinter &printer) const override; private: // Storage for a custom fallback interface. 
void *fallbackEffectOpInterfaces; @@ -117,8 +117,8 @@ def MultiTensorRankOf : TEST_Op<"multi_tensor_rank_of"> { } def TEST_TestType : DialectType()">, "test">, - BuildableType<"$_builder.getType<::mlir::test::TestType>()">; + CPred<"$_self.isa<::test::TestType>()">, "test">, + BuildableType<"$_builder.getType<::test::TestType>()">; //===----------------------------------------------------------------------===// // Test Symbols @@ -372,8 +372,8 @@ def ConversionCallOp : TEST_Op<"conversion_call_op", operand_range getArgOperands() { return inputs(); } /// Return the callee of this operation. - CallInterfaceCallable getCallableForCallee() { - return (*this)->getAttrOfType("callee"); + ::mlir::CallInterfaceCallable getCallableForCallee() { + return (*this)->getAttrOfType<::mlir::SymbolRefAttr>("callee"); } }]; } @@ -384,9 +384,9 @@ def FunctionalRegionOp : TEST_Op<"functional_region_op", let results = (outs FunctionType); let extraClassDeclaration = [{ - Region *getCallableRegion() { return &body(); } - ArrayRef getCallableResults() { - return getType().cast().getResults(); + ::mlir::Region *getCallableRegion() { return &body(); } + ::llvm::ArrayRef<::mlir::Type> getCallableResults() { + return getType().cast<::mlir::FunctionType>().getResults(); } }]; } @@ -748,7 +748,7 @@ def OpFuncRef : TEST_Op<"op_funcref"> { let description = [{ The "test.op_funcref" is a test op with a reference to a function symbol. }]; - let builders = [OpBuilder<(ins "FuncOp":$function)>]; + let builders = [OpBuilder<(ins "::mlir::FuncOp":$function)>]; } // Pattern add the argument plus a increasing static number hidden in @@ -898,10 +898,10 @@ def OpAllAttrConstraint2 : TEST_Op<"all_attr_constraint_of2"> { } def Constraint0 : AttrConstraint< CPred<"$_self.cast()[0]." 
- "cast().getInt() == 0">, + "cast<::mlir::IntegerAttr>().getInt() == 0">, "[0] == 0">; def Constraint1 : AttrConstraint< - CPred<"$_self.cast()[1].cast().getInt() == 1">, + CPred<"$_self.cast()[1].cast<::mlir::IntegerAttr>().getInt() == 1">, "[1] == 1">; def : Pat<(OpAllAttrConstraint1 AllAttrConstraintsOf<[Constraint0, Constraint1]>:$attr), @@ -917,7 +917,7 @@ def TestOpConstant : TEST_Op<"constant", [ConstantLike, NoSideEffect]> { let arguments = (ins AnyAttr:$value); let results = (outs AnyType); let extraClassDeclaration = [{ - Attribute getValue() { return (*this)->getAttr("value"); } + ::mlir::Attribute getValue() { return (*this)->getAttr("value"); } }]; let hasFolder = 1; @@ -1268,7 +1268,7 @@ def MixedVResultOp3 : TEST_Op<"mixed_variadic_out3", // We will use this op in a nested result pattern, where we cannot deduce the // result type. So need to provide a builder not requiring result types. let builders = [ - OpBuilder<(ins "IntegerAttr":$count), + OpBuilder<(ins "::mlir::IntegerAttr":$count), [{ auto i32Type = $_builder.getIntegerType(32); $_state.addTypes(i32Type); // $output1 @@ -1936,8 +1936,8 @@ def CopyOp : TEST_Op<"copy", [CopyOpInterface]> { attr-dict }]; let extraClassDeclaration = [{ - Value getSource() { return source(); } - Value getTarget() { return target(); } + ::mlir::Value getSource() { return source(); } + ::mlir::Value getTarget() { return target(); } }]; } @@ -2027,16 +2027,16 @@ def RegionIfOp : TEST_Op<"region_if", AnyRegion:$elseRegion, AnyRegion:$joinRegion); let extraClassDeclaration = [{ - Block::BlockArgListType getThenArgs() { + ::mlir::Block::BlockArgListType getThenArgs() { return getBody(0)->getArguments(); } - Block::BlockArgListType getElseArgs() { + ::mlir::Block::BlockArgListType getElseArgs() { return getBody(1)->getArguments(); } - Block::BlockArgListType getJoinArgs() { + ::mlir::Block::BlockArgListType getJoinArgs() { return getBody(2)->getArguments(); } - OperandRange getSuccessorEntryOperands(unsigned index); + 
::mlir::OperandRange getSuccessorEntryOperands(unsigned index); }]; } @@ -2089,12 +2089,12 @@ def TableGenBuildOp5 : TEST_Op<"tblgen_build_5", let results = (outs AnyType:$result); let extraClassDeclaration = [{ - static LogicalResult inferReturnTypes(MLIRContext *, - Optional location, ValueRange operands, - DictionaryAttr attributes, RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { + static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *, + ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands, + ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions, + ::llvm::SmallVectorImpl<::mlir::Type> &inferredReturnTypes) { inferredReturnTypes.assign({operands[0].getType()}); - return success(); + return ::mlir::success(); } }]; } diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index a6b0d970792f7..62bed7e0bba2c 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -17,7 +17,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" using namespace mlir; -using namespace mlir::test; +using namespace test; // Native function for testing NativeCodeCall static Value chooseOperand(Value input1, Value input2, BoolAttr choice) { @@ -67,7 +67,7 @@ namespace { // Test Reduce Pattern Interface //===----------------------------------------------------------------------===// -void mlir::test::populateTestReductionPatterns(RewritePatternSet &patterns) { +void test::populateTestReductionPatterns(RewritePatternSet &patterns) { populateWithGenerated(patterns); } diff --git a/mlir/test/lib/Dialect/Test/TestTraits.cpp b/mlir/test/lib/Dialect/Test/TestTraits.cpp index a1a78e724a584..bb78a4b175da1 100644 --- a/mlir/test/lib/Dialect/Test/TestTraits.cpp +++ b/mlir/test/lib/Dialect/Test/TestTraits.cpp @@ -11,7 +11,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" using namespace mlir; -using namespace mlir::test; +using 
namespace test; //===----------------------------------------------------------------------===// // Trait Folder. diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index a5ae219780b4b..e11a042766bf0 100644 --- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -56,7 +56,7 @@ def IntegerType : Test_Type<"TestInteger"> { ins "unsigned":$width, // SignednessSemantics is defined below. - "::mlir::test::TestIntegerType::SignednessSemantics":$signedness + "::test::TestIntegerType::SignednessSemantics":$signedness ); // We define the printer inline. @@ -84,7 +84,7 @@ def IntegerType : Test_Type<"TestInteger"> { int width; if ($_parser.parseInteger(width)) return Type(); if ($_parser.parseGreater()) return Type(); - Location loc = $_parser.getEncodedSourceLoc($_parser.getNameLoc()); + ::mlir::Location loc = $_parser.getEncodedSourceLoc($_parser.getNameLoc()); return getChecked(loc, loc.getContext(), width, signedness); }]; @@ -114,7 +114,7 @@ class FieldInfo_Type : Test_Type { // An ArrayRef of something which requires allocation in the storage // constructor. ArrayRefOfSelfAllocationParameter< - "::mlir::test::FieldInfo", // FieldInfo is defined/declared in TestTypes.h. + "::test::FieldInfo", // FieldInfo is defined/declared in TestTypes.h. 
"Models struct fields">: $fields ); @@ -136,7 +136,7 @@ class FieldInfo_Type : Test_Type { llvm::SmallVector parameters; if ($_parser.parseLess()) return Type(); while (mlir::succeeded($_parser.parseOptionalLBrace())) { - StringRef name; + llvm::StringRef name; if ($_parser.parseKeyword(&name)) return Type(); if ($_parser.parseComma()) return Type(); Type type; @@ -166,12 +166,12 @@ def TestTypeWithLayoutType : Test_Type<"TestTypeWithLayout", [ let mnemonic = "test_type_with_layout"; let parameters = (ins "unsigned":$key); let extraClassDeclaration = [{ - LogicalResult verifyEntries(DataLayoutEntryListRef params, - Location loc) const; + ::mlir::LogicalResult verifyEntries(::mlir::DataLayoutEntryListRef params, + ::mlir::Location loc) const; private: - unsigned extractKind(DataLayoutEntryListRef params, - StringRef expectedKind) const; + unsigned extractKind(::mlir::DataLayoutEntryListRef params, + ::llvm::StringRef expectedKind) const; public: }]; diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp index 6f01540c8b393..960fbbb5348ce 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.cpp +++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp @@ -22,7 +22,7 @@ #include "llvm/ADT/TypeSwitch.h" using namespace mlir; -using namespace mlir::test; +using namespace test; // Custom parser for SignednessSemantics. static ParseResult @@ -65,7 +65,6 @@ static void printSignedness(DialectAsmPrinter &printer, // The functions don't need to be in the header file, but need to be in the mlir // namespace. Declare them here, then define them immediately below. Separating // the declaration and definition adheres to the LLVM coding standards. -namespace mlir { namespace test { // FieldInfo is used as part of a parameter, so equality comparison is // compulsory. @@ -73,16 +72,15 @@ static bool operator==(const FieldInfo &a, const FieldInfo &b); // FieldInfo is used as part of a parameter, so a hash will be computed. 
static llvm::hash_code hash_value(const FieldInfo &fi); // NOLINT } // namespace test -} // namespace mlir // FieldInfo is used as part of a parameter, so equality comparison is // compulsory. -static bool mlir::test::operator==(const FieldInfo &a, const FieldInfo &b) { +static bool test::operator==(const FieldInfo &a, const FieldInfo &b) { return a.name == b.name && a.type == b.type; } // FieldInfo is used as part of a parameter, so a hash will be computed. -static llvm::hash_code mlir::test::hash_value(const FieldInfo &fi) { // NOLINT +static llvm::hash_code test::hash_value(const FieldInfo &fi) { // NOLINT return llvm::hash_combine(fi.name, fi.type); } diff --git a/mlir/test/lib/Dialect/Test/TestTypes.h b/mlir/test/lib/Dialect/Test/TestTypes.h index f9a0289f20b01..7ee722197a25f 100644 --- a/mlir/test/lib/Dialect/Test/TestTypes.h +++ b/mlir/test/lib/Dialect/Test/TestTypes.h @@ -23,81 +23,77 @@ #include "mlir/IR/Types.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" -namespace mlir { namespace test { /// FieldInfo represents a field in the StructType data type. It is used as a /// parameter in TestTypeDefs.td. struct FieldInfo { - StringRef name; - Type type; + ::llvm::StringRef name; + ::mlir::Type type; // Custom allocation called from generated constructor code - FieldInfo allocateInto(TypeStorageAllocator &alloc) const { + FieldInfo allocateInto(::mlir::TypeStorageAllocator &alloc) const { return FieldInfo{alloc.copyInto(name), type}; } }; } // namespace test -} // namespace mlir #include "TestTypeInterfaces.h.inc" #define GET_TYPEDEF_CLASSES #include "TestTypeDefs.h.inc" -namespace mlir { namespace test { /// Storage for simple named recursive types, where the type is identified by /// its name and can "contain" another type, including itself. 
-struct TestRecursiveTypeStorage : public TypeStorage { - using KeyTy = StringRef; +struct TestRecursiveTypeStorage : public ::mlir::TypeStorage { + using KeyTy = ::llvm::StringRef; - explicit TestRecursiveTypeStorage(StringRef key) : name(key), body(Type()) {} + explicit TestRecursiveTypeStorage(::llvm::StringRef key) : name(key), body(::mlir::Type()) {} bool operator==(const KeyTy &other) const { return name == other; } - static TestRecursiveTypeStorage *construct(TypeStorageAllocator &allocator, + static TestRecursiveTypeStorage *construct(::mlir::TypeStorageAllocator &allocator, const KeyTy &key) { return new (allocator.allocate()) TestRecursiveTypeStorage(allocator.copyInto(key)); } - LogicalResult mutate(TypeStorageAllocator &allocator, Type newBody) { + ::mlir::LogicalResult mutate(::mlir::TypeStorageAllocator &allocator, ::mlir::Type newBody) { // Cannot set a different body than before. if (body && body != newBody) - return failure(); + return ::mlir::failure(); body = newBody; - return success(); + return ::mlir::success(); } - StringRef name; - Type body; + ::llvm::StringRef name; + ::mlir::Type body; }; /// Simple recursive type identified by its name and pointing to another named /// type, potentially itself. This requires the body to be mutated separately /// from type creation. class TestRecursiveType - : public Type::TypeBase { + : public ::mlir::Type::TypeBase { public: using Base::Base; - static TestRecursiveType get(MLIRContext *ctx, StringRef name) { + static TestRecursiveType get(::mlir::MLIRContext *ctx, ::llvm::StringRef name) { return Base::get(ctx, name); } /// Body getter and setter. - LogicalResult setBody(Type body) { return Base::mutate(body); } - Type getBody() { return getImpl()->body; } + ::mlir::LogicalResult setBody(Type body) { return Base::mutate(body); } + ::mlir::Type getBody() { return getImpl()->body; } /// Name/key getter. 
- StringRef getName() { return getImpl()->name; } + ::llvm::StringRef getName() { return getImpl()->name; } }; } // namespace test -} // namespace mlir #endif // MLIR_TESTTYPES_H diff --git a/mlir/test/lib/IR/TestInterfaces.cpp b/mlir/test/lib/IR/TestInterfaces.cpp index d5e1d699502fb..bafcdcb5d0c6f 100644 --- a/mlir/test/lib/IR/TestInterfaces.cpp +++ b/mlir/test/lib/IR/TestInterfaces.cpp @@ -10,7 +10,7 @@ #include "mlir/Pass/Pass.h" using namespace mlir; -using namespace mlir::test; +using namespace test; namespace { /// This test checks various aspects of Type interface generation and diff --git a/mlir/test/lib/IR/TestTypes.cpp b/mlir/test/lib/IR/TestTypes.cpp index 0b6e003902341..7d078868f9dd8 100644 --- a/mlir/test/lib/IR/TestTypes.cpp +++ b/mlir/test/lib/IR/TestTypes.cpp @@ -11,7 +11,7 @@ #include "mlir/Pass/Pass.h" using namespace mlir; -using namespace mlir::test; +using namespace test; namespace { struct TestRecursiveTypesPass diff --git a/mlir/test/lib/Transforms/TestInlining.cpp b/mlir/test/lib/Transforms/TestInlining.cpp index 53a16679b512c..c88ee9e7c1c92 100644 --- a/mlir/test/lib/Transforms/TestInlining.cpp +++ b/mlir/test/lib/Transforms/TestInlining.cpp @@ -22,7 +22,7 @@ #include "llvm/ADT/StringSet.h" using namespace mlir; -using namespace mlir::test; +using namespace test; namespace { struct Inliner : public PassWrapper { diff --git a/mlir/test/mlir-tblgen/attrdefs.td b/mlir/test/mlir-tblgen/attrdefs.td index a5a41b3039918..c42da9433e434 100644 --- a/mlir/test/mlir-tblgen/attrdefs.td +++ b/mlir/test/mlir-tblgen/attrdefs.td @@ -13,21 +13,21 @@ include "mlir/IR/OpBase.td" // DEF: #ifdef GET_ATTRDEF_LIST // DEF: #undef GET_ATTRDEF_LIST -// DEF: ::mlir::test::SimpleAAttr, -// DEF: ::mlir::test::CompoundAAttr, -// DEF: ::mlir::test::IndexAttr, -// DEF: ::mlir::test::SingleParameterAttr +// DEF: ::test::SimpleAAttr, +// DEF: ::test::CompoundAAttr, +// DEF: ::test::IndexAttr, +// DEF: ::test::SingleParameterAttr // DEF-LABEL: ::mlir::OptionalParseResult 
generatedAttributeParser(::mlir::MLIRContext *context, // DEF-NEXT: ::mlir::DialectAsmParser &parser, // DEF-NEXT: ::llvm::StringRef mnemonic, ::mlir::Type type, // DEF-NEXT: ::mlir::Attribute &value) { -// DEF: if (mnemonic == ::mlir::test::CompoundAAttr::getMnemonic()) { -// DEF-NEXT: value = ::mlir::test::CompoundAAttr::parse(context, parser, type); +// DEF: if (mnemonic == ::test::CompoundAAttr::getMnemonic()) { +// DEF-NEXT: value = ::test::CompoundAAttr::parse(context, parser, type); // DEF-NEXT: return ::mlir::success(!!value); // DEF-NEXT: } -// DEF-NEXT: if (mnemonic == ::mlir::test::IndexAttr::getMnemonic()) { -// DEF-NEXT: value = ::mlir::test::IndexAttr::parse(context, parser, type); +// DEF-NEXT: if (mnemonic == ::test::IndexAttr::getMnemonic()) { +// DEF-NEXT: value = ::test::IndexAttr::parse(context, parser, type); // DEF-NEXT: return ::mlir::success(!!value); // DEF: return {}; @@ -35,7 +35,7 @@ def Test_Dialect: Dialect { // DECL-NOT: TestDialect // DEF-NOT: TestDialect let name = "TestDialect"; - let cppNamespace = "::mlir::test"; + let cppNamespace = "::test"; } class TestAttr : AttrDef { } @@ -52,7 +52,7 @@ def B_CompoundAttrA : TestAttr<"CompoundA"> { let parameters = ( ins "int":$widthOfSomething, - "::mlir::test::SimpleTypeA": $exampleTdType, + "::test::SimpleTypeA": $exampleTdType, APFloatParameter<"">: $apFloat, ArrayRefParameter<"int", "Matrix dimensions">:$dims, AttributeSelfTypeParameter<"">:$inner @@ -61,8 +61,8 @@ def B_CompoundAttrA : TestAttr<"CompoundA"> { let genVerifyDecl = 1; // DECL-LABEL: class CompoundAAttr : public ::mlir::Attribute -// DECL: static CompoundAAttr getChecked(llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, ::mlir::MLIRContext *context, int widthOfSomething, ::mlir::test::SimpleTypeA exampleTdType, ::llvm::APFloat apFloat, ::llvm::ArrayRef dims, ::mlir::Type inner); -// DECL: static ::mlir::LogicalResult verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, int widthOfSomething, 
::mlir::test::SimpleTypeA exampleTdType, ::llvm::APFloat apFloat, ::llvm::ArrayRef dims, ::mlir::Type inner); +// DECL: static CompoundAAttr getChecked(llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, ::mlir::MLIRContext *context, int widthOfSomething, ::test::SimpleTypeA exampleTdType, ::llvm::APFloat apFloat, ::llvm::ArrayRef dims, ::mlir::Type inner); +// DECL: static ::mlir::LogicalResult verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, int widthOfSomething, ::test::SimpleTypeA exampleTdType, ::llvm::APFloat apFloat, ::llvm::ArrayRef dims, ::mlir::Type inner); // DECL: static constexpr ::llvm::StringLiteral getMnemonic() { // DECL: return ::llvm::StringLiteral("cmpnd_a"); // DECL: } @@ -70,7 +70,7 @@ def B_CompoundAttrA : TestAttr<"CompoundA"> { // DECL-NEXT: ::mlir::DialectAsmParser &parser, ::mlir::Type type); // DECL: void print(::mlir::DialectAsmPrinter &printer) const; // DECL: int getWidthOfSomething() const; -// DECL: ::mlir::test::SimpleTypeA getExampleTdType() const; +// DECL: ::test::SimpleTypeA getExampleTdType() const; // DECL: ::llvm::APFloat getApFloat() const; // Check that AttributeSelfTypeParameter is handled properly. 
diff --git a/mlir/test/mlir-tblgen/typedefs.td b/mlir/test/mlir-tblgen/typedefs.td index 69848116fbd36..cffc0ec58faf8 100644 --- a/mlir/test/mlir-tblgen/typedefs.td +++ b/mlir/test/mlir-tblgen/typedefs.td @@ -13,22 +13,22 @@ include "mlir/IR/OpBase.td" // DEF: #ifdef GET_TYPEDEF_LIST // DEF: #undef GET_TYPEDEF_LIST -// DEF: ::mlir::test::SimpleAType, -// DEF: ::mlir::test::CompoundAType, -// DEF: ::mlir::test::IndexType, -// DEF: ::mlir::test::SingleParameterType, -// DEF: ::mlir::test::IntegerType +// DEF: ::test::SimpleAType, +// DEF: ::test::CompoundAType, +// DEF: ::test::IndexType, +// DEF: ::test::SingleParameterType, +// DEF: ::test::IntegerType // DEF-LABEL: ::mlir::OptionalParseResult generatedTypeParser(::mlir::MLIRContext *context, // DEF-NEXT: ::mlir::DialectAsmParser &parser, // DEF-NEXT: ::llvm::StringRef mnemonic, // DEF-NEXT: ::mlir::Type &value) { -// DEF: if (mnemonic == ::mlir::test::CompoundAType::getMnemonic()) { -// DEF-NEXT: value = ::mlir::test::CompoundAType::parse(context, parser); +// DEF: if (mnemonic == ::test::CompoundAType::getMnemonic()) { +// DEF-NEXT: value = ::test::CompoundAType::parse(context, parser); // DEF-NEXT: return ::mlir::success(!!value); // DEF-NEXT: } -// DEF-NEXT: if (mnemonic == ::mlir::test::IndexType::getMnemonic()) { -// DEF-NEXT: value = ::mlir::test::IndexType::parse(context, parser); +// DEF-NEXT: if (mnemonic == ::test::IndexType::getMnemonic()) { +// DEF-NEXT: value = ::test::IndexType::parse(context, parser); // DEF-NEXT: return ::mlir::success(!!value); // DEF: return {}; @@ -36,7 +36,7 @@ def Test_Dialect: Dialect { // DECL-NOT: TestDialect // DEF-NOT: TestDialect let name = "TestDialect"; - let cppNamespace = "::mlir::test"; + let cppNamespace = "::test"; } class TestType : TypeDef { } @@ -57,7 +57,7 @@ def B_CompoundTypeA : TestType<"CompoundA"> { let parameters = ( ins "int":$widthOfSomething, - "::mlir::test::SimpleTypeA": $exampleTdType, + "::test::SimpleTypeA": $exampleTdType, "SomeCppStruct": 
$exampleCppType, ArrayRefParameter<"int", "Matrix dimensions">:$dims, RTLValueType:$inner @@ -66,8 +66,8 @@ def B_CompoundTypeA : TestType<"CompoundA"> { let genVerifyDecl = 1; // DECL-LABEL: class CompoundAType : public ::mlir::Type -// DECL: static CompoundAType getChecked(llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, ::mlir::MLIRContext *context, int widthOfSomething, ::mlir::test::SimpleTypeA exampleTdType, SomeCppStruct exampleCppType, ::llvm::ArrayRef dims, ::mlir::Type inner); -// DECL: static ::mlir::LogicalResult verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, int widthOfSomething, ::mlir::test::SimpleTypeA exampleTdType, SomeCppStruct exampleCppType, ::llvm::ArrayRef dims, ::mlir::Type inner); +// DECL: static CompoundAType getChecked(llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, ::mlir::MLIRContext *context, int widthOfSomething, ::test::SimpleTypeA exampleTdType, SomeCppStruct exampleCppType, ::llvm::ArrayRef dims, ::mlir::Type inner); +// DECL: static ::mlir::LogicalResult verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, int widthOfSomething, ::test::SimpleTypeA exampleTdType, SomeCppStruct exampleCppType, ::llvm::ArrayRef dims, ::mlir::Type inner); // DECL: static constexpr ::llvm::StringLiteral getMnemonic() { // DECL: return ::llvm::StringLiteral("cmpnd_a"); // DECL: } @@ -75,7 +75,7 @@ def B_CompoundTypeA : TestType<"CompoundA"> { // DECL-NEXT: ::mlir::DialectAsmParser &parser); // DECL: void print(::mlir::DialectAsmPrinter &printer) const; // DECL: int getWidthOfSomething() const; -// DECL: ::mlir::test::SimpleTypeA getExampleTdType() const; +// DECL: ::test::SimpleTypeA getExampleTdType() const; // DECL: SomeCppStruct getExampleCppType() const; } diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index efbd9ed883b1d..3059b1fafb96a 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -70,7 +70,6 @@ void 
registerTestGpuSerializeToHsacoPass(); void registerTestDataLayoutQuery(); void registerTestDecomposeCallGraphTypes(); void registerTestDiagnosticsPass(); -void registerTestDialect(DialectRegistry &); void registerTestDominancePass(); void registerTestDynamicPipelinePass(); void registerTestExpandTanhPass(); @@ -108,6 +107,10 @@ void registerTestVectorConversions(); } // namespace test } // namespace mlir +namespace test { +void registerTestDialect(DialectRegistry &); +} // namespace test + #ifdef MLIR_INCLUDE_TESTS void registerTestPasses() { registerConvertToTargetEnvPass(); @@ -135,58 +138,58 @@ void registerTestPasses() { registerVectorizerTestPass(); registerTosaTestQuantUtilAPIPass(); - test::registerConvertCallOpPass(); - test::registerInliner(); - test::registerMemRefBoundCheck(); - test::registerPatternsTestPass(); - test::registerSimpleParametricTilingPass(); - test::registerTestAffineLoopParametricTilingPass(); - test::registerTestAliasAnalysisPass(); - test::registerTestCallGraphPass(); - test::registerTestConstantFold(); - test::registerTestDiagnosticsPass(); + mlir::test::registerConvertCallOpPass(); + mlir::test::registerInliner(); + mlir::test::registerMemRefBoundCheck(); + mlir::test::registerPatternsTestPass(); + mlir::test::registerSimpleParametricTilingPass(); + mlir::test::registerTestAffineLoopParametricTilingPass(); + mlir::test::registerTestAliasAnalysisPass(); + mlir::test::registerTestCallGraphPass(); + mlir::test::registerTestConstantFold(); + mlir::test::registerTestDiagnosticsPass(); #if MLIR_CUDA_CONVERSIONS_ENABLED - test::registerTestGpuSerializeToCubinPass(); + mlir::test::registerTestGpuSerializeToCubinPass(); #endif #if MLIR_ROCM_CONVERSIONS_ENABLED - test::registerTestGpuSerializeToHsacoPass(); + mlir::test::registerTestGpuSerializeToHsacoPass(); #endif - test::registerTestConvVectorization(); - test::registerTestDecomposeCallGraphTypes(); - test::registerTestDataLayoutQuery(); - test::registerTestDominancePass(); - 
test::registerTestDynamicPipelinePass(); - test::registerTestExpandTanhPass(); - test::registerTestComposeSubView(); - test::registerTestGpuParallelLoopMappingPass(); - test::registerTestIRVisitorsPass(); - test::registerTestInterfaces(); - test::registerTestLinalgCodegenStrategy(); - test::registerTestLinalgDistribution(); - test::registerTestLinalgElementwiseFusion(); - test::registerTestPushExpandingReshape(); - test::registerTestLinalgFusionTransforms(); - test::registerTestLinalgTensorFusionTransforms(); - test::registerTestLinalgTiledLoopFusionTransforms(); - test::registerTestLinalgGreedyFusion(); - test::registerTestLinalgHoisting(); - test::registerTestLinalgTileAndFuseSequencePass(); - test::registerTestLinalgTransforms(); - test::registerTestLivenessPass(); - test::registerTestLoopFusion(); - test::registerTestLoopMappingPass(); - test::registerTestLoopUnrollingPass(); - test::registerTestMathAlgebraicSimplificationPass(); - test::registerTestMathPolynomialApproximationPass(); - test::registerTestMemRefDependenceCheck(); - test::registerTestMemRefStrideCalculation(); - test::registerTestNumberOfBlockExecutionsPass(); - test::registerTestNumberOfOperationExecutionsPass(); - test::registerTestOpaqueLoc(); - test::registerTestPDLByteCodePass(); - test::registerTestRecursiveTypesPass(); - test::registerTestSCFUtilsPass(); - test::registerTestVectorConversions(); + mlir::test::registerTestConvVectorization(); + mlir::test::registerTestDecomposeCallGraphTypes(); + mlir::test::registerTestDataLayoutQuery(); + mlir::test::registerTestDominancePass(); + mlir::test::registerTestDynamicPipelinePass(); + mlir::test::registerTestExpandTanhPass(); + mlir::test::registerTestComposeSubView(); + mlir::test::registerTestGpuParallelLoopMappingPass(); + mlir::test::registerTestIRVisitorsPass(); + mlir::test::registerTestInterfaces(); + mlir::test::registerTestLinalgCodegenStrategy(); + mlir::test::registerTestLinalgDistribution(); + 
mlir::test::registerTestLinalgElementwiseFusion(); + mlir::test::registerTestPushExpandingReshape(); + mlir::test::registerTestLinalgFusionTransforms(); + mlir::test::registerTestLinalgTensorFusionTransforms(); + mlir::test::registerTestLinalgTiledLoopFusionTransforms(); + mlir::test::registerTestLinalgGreedyFusion(); + mlir::test::registerTestLinalgHoisting(); + mlir::test::registerTestLinalgTileAndFuseSequencePass(); + mlir::test::registerTestLinalgTransforms(); + mlir::test::registerTestLivenessPass(); + mlir::test::registerTestLoopFusion(); + mlir::test::registerTestLoopMappingPass(); + mlir::test::registerTestLoopUnrollingPass(); + mlir::test::registerTestMathAlgebraicSimplificationPass(); + mlir::test::registerTestMathPolynomialApproximationPass(); + mlir::test::registerTestMemRefDependenceCheck(); + mlir::test::registerTestMemRefStrideCalculation(); + mlir::test::registerTestNumberOfBlockExecutionsPass(); + mlir::test::registerTestNumberOfOperationExecutionsPass(); + mlir::test::registerTestOpaqueLoc(); + mlir::test::registerTestPDLByteCodePass(); + mlir::test::registerTestRecursiveTypesPass(); + mlir::test::registerTestSCFUtilsPass(); + mlir::test::registerTestVectorConversions(); } #endif @@ -198,7 +201,7 @@ int main(int argc, char **argv) { DialectRegistry registry; registerAllDialects(registry); #ifdef MLIR_INCLUDE_TESTS - test::registerTestDialect(registry); + ::test::registerTestDialect(registry); #endif return mlir::asMainReturnCode( mlir::MlirOptMain(argc, argv, "MLIR modular optimizer driver\n", registry, diff --git a/mlir/tools/mlir-reduce/mlir-reduce.cpp b/mlir/tools/mlir-reduce/mlir-reduce.cpp index 01d7c96cd8bbc..44b21b805e8c3 100644 --- a/mlir/tools/mlir-reduce/mlir-reduce.cpp +++ b/mlir/tools/mlir-reduce/mlir-reduce.cpp @@ -21,13 +21,11 @@ using namespace mlir; -namespace mlir { namespace test { #ifdef MLIR_INCLUDE_TESTS void registerTestDialect(DialectRegistry &); #endif } // namespace test -} // namespace mlir int main(int argc, char **argv) 
{ registerAllPasses(); diff --git a/mlir/unittests/IR/InterfaceAttachmentTest.cpp b/mlir/unittests/IR/InterfaceAttachmentTest.cpp index 76124707cbfc7..3b362fa221899 100644 --- a/mlir/unittests/IR/InterfaceAttachmentTest.cpp +++ b/mlir/unittests/IR/InterfaceAttachmentTest.cpp @@ -22,7 +22,7 @@ #include "../../test/lib/Dialect/Test/TestTypes.h" using namespace mlir; -using namespace mlir::test; +using namespace test; namespace { From 2b423509941c92766f3682055bef3ba8bfaf1416 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 14 Aug 2021 22:48:10 +0100 Subject: [PATCH 041/700] [InstCombine] Extend sadd.sat tests to include min/max patterns. NFC This tests code starting from smin/smax, as opposed to the icmp/select form. Also adds a ARM MVE phase ordering test for vectorizing to sadd.sat from the original IR. --- llvm/test/Transforms/InstCombine/sadd_sat.ll | 195 ++++++++++++++++++ .../PhaseOrdering/ARM/arm_add_q7.ll | 183 ++++++++++++++++ 2 files changed, 378 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll diff --git a/llvm/test/Transforms/InstCombine/sadd_sat.ll b/llvm/test/Transforms/InstCombine/sadd_sat.ll index 04dd4f5c038dc..ff4a5e656fec8 100644 --- a/llvm/test/Transforms/InstCombine/sadd_sat.ll +++ b/llvm/test/Transforms/InstCombine/sadd_sat.ll @@ -21,6 +21,27 @@ entry: ret i32 %conv7 } +define i32 @sadd_sat32_mm(i32 %a, i32 %b) { +; CHECK-LABEL: @sadd_sat32_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] to i32 +; CHECK-NEXT: ret i32 [[CONV7]] +; +entry: + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b 
to i64 + %add = add i64 %conv1, %conv + %spec.store.select = call i64 @llvm.smin.i64(i64 %add, i64 2147483647) + %spec.store.select8 = call i64 @llvm.smax.i64(i64 %spec.store.select, i64 -2147483648) + %conv7 = trunc i64 %spec.store.select8 to i32 + ret i32 %conv7 +} + define i32 @ssub_sat32(i32 %a, i32 %b) { ; CHECK-LABEL: @ssub_sat32( ; CHECK-NEXT: entry: @@ -39,6 +60,27 @@ entry: ret i32 %conv7 } +define i32 @ssub_sat32_mm(i32 %a, i32 %b) { +; CHECK-LABEL: @ssub_sat32_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV1]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[SUB]], i64 2147483647) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] to i32 +; CHECK-NEXT: ret i32 [[CONV7]] +; +entry: + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %sub = sub i64 %conv, %conv1 + %spec.store.select = call i64 @llvm.smin.i64(i64 %sub, i64 2147483647) + %spec.store.select8 = call i64 @llvm.smax.i64(i64 %spec.store.select, i64 -2147483648) + %conv7 = trunc i64 %spec.store.select8 to i32 + ret i32 %conv7 +} + define i32 @smul_sat32(i32 %a, i32 %b) { ; CHECK-LABEL: @smul_sat32( ; CHECK-NEXT: entry: @@ -64,6 +106,27 @@ entry: ret i32 %conv7 } +define i32 @smul_sat32_mm(i32 %a, i32 %b) { +; CHECK-LABEL: @smul_sat32_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = mul nsw i64 [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] 
to i32 +; CHECK-NEXT: ret i32 [[CONV7]] +; +entry: + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %add = mul i64 %conv1, %conv + %spec.store.select = call i64 @llvm.smin.i64(i64 %add, i64 2147483647) + %spec.store.select8 = call i64 @llvm.smax.i64(i64 %spec.store.select, i64 -2147483648) + %conv7 = trunc i64 %spec.store.select8 to i32 + ret i32 %conv7 +} + define signext i16 @sadd_sat16(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: @sadd_sat16( ; CHECK-NEXT: entry: @@ -82,6 +145,27 @@ entry: ret i16 %conv9 } +define signext i16 @sadd_sat16_mm(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: @sadd_sat16_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[A:%.*]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[ADD]], i32 32767) +; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -32768) +; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i16 +; CHECK-NEXT: ret i16 [[CONV9]] +; +entry: + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %add = add i32 %conv1, %conv + %spec.store.select = call i32 @llvm.smin.i32(i32 %add, i32 32767) + %spec.store.select10 = call i32 @llvm.smax.i32(i32 %spec.store.select, i32 -32768) + %conv9 = trunc i32 %spec.store.select10 to i16 + ret i16 %conv9 +} + define signext i16 @ssub_sat16(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: @ssub_sat16( ; CHECK-NEXT: entry: @@ -100,6 +184,27 @@ entry: ret i16 %conv9 } +define signext i16 @ssub_sat16_mm(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: @ssub_sat16_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[A:%.*]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 
32767) +; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -32768) +; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i16 +; CHECK-NEXT: ret i16 [[CONV9]] +; +entry: + %conv = sext i16 %a to i32 + %conv1 = sext i16 %b to i32 + %sub = sub i32 %conv, %conv1 + %spec.store.select = call i32 @llvm.smin.i32(i32 %sub, i32 32767) + %spec.store.select10 = call i32 @llvm.smax.i32(i32 %spec.store.select, i32 -32768) + %conv9 = trunc i32 %spec.store.select10 to i16 + ret i16 %conv9 +} + define signext i8 @sadd_sat8(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: @sadd_sat8( ; CHECK-NEXT: entry: @@ -118,6 +223,27 @@ entry: ret i8 %conv9 } +define signext i8 @sadd_sat8_mm(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: @sadd_sat8_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[B:%.*]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[ADD]], i32 127) +; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -128) +; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i8 +; CHECK-NEXT: ret i8 [[CONV9]] +; +entry: + %conv = sext i8 %a to i32 + %conv1 = sext i8 %b to i32 + %add = add i32 %conv1, %conv + %spec.store.select = call i32 @llvm.smin.i32(i32 %add, i32 127) + %spec.store.select10 = call i32 @llvm.smax.i32(i32 %spec.store.select, i32 -128) + %conv9 = trunc i32 %spec.store.select10 to i8 + ret i8 %conv9 +} + define signext i8 @ssub_sat8(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: @ssub_sat8( ; CHECK-NEXT: entry: @@ -136,6 +262,27 @@ entry: ret i8 %conv9 } +define signext i8 @ssub_sat8_mm(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: @ssub_sat8_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[B:%.*]] to i32 +; CHECK-NEXT: 
[[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 127) +; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -128) +; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i8 +; CHECK-NEXT: ret i8 [[CONV9]] +; +entry: + %conv = sext i8 %a to i32 + %conv1 = sext i8 %b to i32 + %sub = sub i32 %conv, %conv1 + %spec.store.select = call i32 @llvm.smin.i32(i32 %sub, i32 127) + %spec.store.select10 = call i32 @llvm.smax.i32(i32 %spec.store.select, i32 -128) + %conv9 = trunc i32 %spec.store.select10 to i8 + ret i8 %conv9 +} + define signext i64 @sadd_sat64(i64 signext %a, i64 signext %b) { ; CHECK-LABEL: @sadd_sat64( ; CHECK-NEXT: entry: @@ -240,6 +387,27 @@ entry: ret <4 x i32> %conv7 } +define <4 x i32> @sadd_satv4i32_mm(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @sadd_satv4i32_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext <4 x i32> [[A:%.*]] to <4 x i64> +; CHECK-NEXT: [[CONV1:%.*]] = sext <4 x i32> [[B:%.*]] to <4 x i64> +; CHECK-NEXT: [[ADD:%.*]] = add nsw <4 x i64> [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[ADD]], <4 x i64> ) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[SPEC_STORE_SELECT]], <4 x i64> ) +; CHECK-NEXT: [[CONV7:%.*]] = trunc <4 x i64> [[SPEC_STORE_SELECT8]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[CONV7]] +; +entry: + %conv = sext <4 x i32> %a to <4 x i64> + %conv1 = sext <4 x i32> %b to <4 x i64> + %add = add <4 x i64> %conv1, %conv + %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %add, <4 x i64> ) + %spec.store.select8 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %spec.store.select, <4 x i64> ) + %conv7 = trunc <4 x i64> %spec.store.select8 to <4 x i32> + ret <4 x i32> %conv7 +} + define <4 x i32> @ssub_satv4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @ssub_satv4i32( ; 
CHECK-NEXT: entry: @@ -258,6 +426,27 @@ entry: ret <4 x i32> %conv7 } +define <4 x i32> @ssub_satv4i32_mm(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @ssub_satv4i32_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext <4 x i32> [[A:%.*]] to <4 x i64> +; CHECK-NEXT: [[CONV1:%.*]] = sext <4 x i32> [[B:%.*]] to <4 x i64> +; CHECK-NEXT: [[ADD:%.*]] = sub nsw <4 x i64> [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[ADD]], <4 x i64> ) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[SPEC_STORE_SELECT]], <4 x i64> ) +; CHECK-NEXT: [[CONV7:%.*]] = trunc <4 x i64> [[SPEC_STORE_SELECT8]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[CONV7]] +; +entry: + %conv = sext <4 x i32> %a to <4 x i64> + %conv1 = sext <4 x i32> %b to <4 x i64> + %add = sub <4 x i64> %conv1, %conv + %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %add, <4 x i64> ) + %spec.store.select8 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %spec.store.select, <4 x i64> ) + %conv7 = trunc <4 x i64> %spec.store.select8 to <4 x i32> + ret <4 x i32> %conv7 +} + define <4 x i32> @sadd_satv4i4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @sadd_satv4i4( ; CHECK-NEXT: entry: @@ -501,3 +690,9 @@ entry: } declare void @use64(i64) +declare i64 @llvm.smin.i64(i64, i64) +declare i64 @llvm.smax.i64(i64, i64) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) +declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) +declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll new file mode 100644 index 0000000000000..cacedbb7aa962 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='default' -S | FileCheck %s + +target datalayout = 
"e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-none-eabi" + +; This should, after inlining and simplification, be a single tail predicated +; 16x vector loop handling llvm.sadd.sat. __SSAT is inlined and so is DCE'd. + +; Function Attrs: nounwind +define dso_local void @arm_add_q7(i8* %pSrcA, i8* %pSrcB, i8* noalias %pDst, i32 %blockSize) #0 { +; CHECK-LABEL: @arm_add_q7( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_NOT3:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT3]], label [[WHILE_END:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[BLOCKSIZE]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PSRCA:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, i8* [[PDST:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, i8* [[PSRCB:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[BLOCKSIZE]]) +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[NEXT_GEP15]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> [[WIDE_MASKED_LOAD18]]) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP14]] to <16 x i8>* +; CHECK-NEXT: call void 
@llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[TMP2]], <16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP4]], label [[WHILE_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: while.end: +; CHECK-NEXT: ret void +; +entry: + %pSrcA.addr = alloca i8*, align 4 + %pSrcB.addr = alloca i8*, align 4 + %pDst.addr = alloca i8*, align 4 + %blockSize.addr = alloca i32, align 4 + %blkCnt = alloca i32, align 4 + store i8* %pSrcA, i8** %pSrcA.addr, align 4 + store i8* %pSrcB, i8** %pSrcB.addr, align 4 + store i8* %pDst, i8** %pDst.addr, align 4 + store i32 %blockSize, i32* %blockSize.addr, align 4 + %0 = bitcast i32* %blkCnt to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) + %1 = load i32, i32* %blockSize.addr, align 4 + store i32 %1, i32* %blkCnt, align 4 + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %2 = load i32, i32* %blkCnt, align 4 + %cmp = icmp ugt i32 %2, 0 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %3 = load i8*, i8** %pSrcA.addr, align 4 + %incdec.ptr = getelementptr inbounds i8, i8* %3, i32 1 + store i8* %incdec.ptr, i8** %pSrcA.addr, align 4 + %4 = load i8, i8* %3, align 1 + %conv = sext i8 %4 to i16 + %conv1 = sext i16 %conv to i32 + %5 = load i8*, i8** %pSrcB.addr, align 4 + %incdec.ptr2 = getelementptr inbounds i8, i8* %5, i32 1 + store i8* %incdec.ptr2, i8** %pSrcB.addr, align 4 + %6 = load i8, i8* %5, align 1 + %conv3 = sext i8 %6 to i32 + %add = add nsw i32 %conv1, %conv3 + %call = call i32 @__SSAT(i32 %add, i32 8) + %conv4 = trunc i32 %call to i8 + %7 = load i8*, i8** %pDst.addr, align 4 + %incdec.ptr5 = getelementptr inbounds i8, i8* %7, i32 1 + store i8* %incdec.ptr5, i8** %pDst.addr, align 4 + store i8 %conv4, i8* %7, align 1 + %8 = load i32, i32* %blkCnt, align 4 + %dec = add i32 %8, -1 + store i32 %dec, 
i32* %blkCnt, align 4 + br label %while.cond + +while.end: ; preds = %while.cond + %9 = bitcast i32* %blkCnt to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %9) + ret void +} + +define internal i32 @__SSAT(i32 %val, i32 %sat) #1 { +entry: + %retval = alloca i32, align 4 + %val.addr = alloca i32, align 4 + %sat.addr = alloca i32, align 4 + %max = alloca i32, align 4 + %min = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + store i32 %val, i32* %val.addr, align 4 + store i32 %sat, i32* %sat.addr, align 4 + %0 = load i32, i32* %sat.addr, align 4 + %cmp = icmp uge i32 %0, 1 + br i1 %cmp, label %land.lhs.true, label %if.end10 + +land.lhs.true: ; preds = %entry + %1 = load i32, i32* %sat.addr, align 4 + %cmp1 = icmp ule i32 %1, 32 + br i1 %cmp1, label %if.then, label %if.end10 + +if.then: ; preds = %land.lhs.true + %2 = bitcast i32* %max to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) + %3 = load i32, i32* %sat.addr, align 4 + %sub = sub i32 %3, 1 + %shl = shl i32 1, %sub + %sub2 = sub i32 %shl, 1 + store i32 %sub2, i32* %max, align 4 + %4 = bitcast i32* %min to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) + %5 = load i32, i32* %max, align 4 + %sub3 = sub nsw i32 -1, %5 + store i32 %sub3, i32* %min, align 4 + %6 = load i32, i32* %val.addr, align 4 + %7 = load i32, i32* %max, align 4 + %cmp4 = icmp sgt i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.else + +if.then5: ; preds = %if.then + %8 = load i32, i32* %max, align 4 + store i32 %8, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +if.else: ; preds = %if.then + %9 = load i32, i32* %val.addr, align 4 + %10 = load i32, i32* %min, align 4 + %cmp6 = icmp slt i32 %9, %10 + br i1 %cmp6, label %if.then7, label %if.end + +if.then7: ; preds = %if.else + %11 = load i32, i32* %min, align 4 + store i32 %11, i32* %retval, align 4 + store i32 1, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +if.end: ; preds = %if.else + br label 
%if.end8 + +if.end8: ; preds = %if.end + store i32 0, i32* %cleanup.dest.slot, align 4 + br label %cleanup + +cleanup: ; preds = %if.end8, %if.then7, %if.then5 + %12 = bitcast i32* %min to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %max to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %13) + %cleanup.dest = load i32, i32* %cleanup.dest.slot, align 4 + switch i32 %cleanup.dest, label %unreachable [ + i32 0, label %cleanup.cont + i32 1, label %return + ] + +cleanup.cont: ; preds = %cleanup + br label %if.end10 + +if.end10: ; preds = %cleanup.cont, %land.lhs.true, %entry + %14 = load i32, i32* %val.addr, align 4 + store i32 %14, i32* %retval, align 4 + br label %return + +return: ; preds = %if.end10, %cleanup + %15 = load i32, i32* %retval, align 4 + ret i32 %15 + +unreachable: ; preds = %cleanup + unreachable +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" } +attributes #1 = { alwaysinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" 
"target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-d32,-dotprod,-fp-armv8,-fp-armv8sp,-fp16fml,-hwdiv-arm,-i8mm,-neon,-sb,-sha2,-vfp3,-vfp3sp,-vfp4,-vfp4sp" "unsafe-fp-math"="true" } From 7aef2e54c893b9018990e6183d7c71a1fdfd8813 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sat, 14 Aug 2021 17:58:21 -0400 Subject: [PATCH 042/700] Simplify a .mailmap entry Only one person committed with these email addresses, so there's no need to use the map-different-names-for-one-email-address syntax. No behavior change. Differential Revision: https://reviews.llvm.org/D108007 --- .mailmap | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 05877be16ce29..2134bebcc5b13 100644 --- a/.mailmap +++ b/.mailmap @@ -28,9 +28,8 @@ -Jon Roelofs Jon Roelofs -Jon Roelofs Jonathan Roelofs -Jon Roelofs Jonathan Roelofs +Jon Roelofs +Jon Roelofs LLVM GN Syncbot Martin Storsjö Saleem Abdulrasool From 918dad54bd226a8c2fda42bc63dbb1f324bf6c24 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Sun, 15 Aug 2021 00:20:47 +0200 Subject: [PATCH 043/700] [test] Avoid unportable echo in Other/lit-quoting.txt `LLVM :: Other/lit-quoting.txt` currently `FAIL`s on Solaris: llvm/test/Other/lit-quoting.txt:8:9: error: CHECK2: expected string not found in input CHECK2: {{^a\[b\\c$}} ^ :1:1: note: scanning from here a[b ^ This happens because echo with backslashes or special characters is unportable, as extensively documented in the Autoconf manual. In the case at hand, `echo 'a[b\c'` yields `a[b\c` on Linux, but `a[b` (no newline) on Solaris. This patch fixes this by using the portable alternative suggested in the Autoconf manual. Tested on `amd64-pc-solaris2.11`, `sparcv9-sun-solaris2.11`, and `x86_64-pc-linux-gnu`. 
Differential Revision: https://reviews.llvm.org/D108031 --- llvm/test/Other/lit-quoting.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/Other/lit-quoting.txt b/llvm/test/Other/lit-quoting.txt index 48a72e20e4603..76dc67956a290 100644 --- a/llvm/test/Other/lit-quoting.txt +++ b/llvm/test/Other/lit-quoting.txt @@ -1,14 +1,14 @@ -RUN: echo "\"" | FileCheck %s --check-prefix=CHECK1 -RUN: echo '"' | FileCheck %s --check-prefix=CHECK1 -RUN: echo 'a[b\c' | FileCheck %s --check-prefix=CHECK2 -RUN: echo "a[b\\c" | FileCheck %s --check-prefix=CHECK2 -RUN: echo 'a\b\\c\\\\d' | FileCheck %s --check-prefix=CHECK3 -RUN: echo "a\\b\\\\c\\\\\\\\d" | FileCheck %s --check-prefix=CHECK3 +RUN: printf "%%s\n" "\"" | FileCheck %s --check-prefix=CHECK1 +RUN: printf "%%s\n" '"' | FileCheck %s --check-prefix=CHECK1 +RUN: printf "%%s\n" 'a[b\c' | FileCheck %s --check-prefix=CHECK2 +RUN: printf "%%s\n" "a[b\\c" | FileCheck %s --check-prefix=CHECK2 +RUN: printf "%%s\n" 'a\b\\c\\\\d' | FileCheck %s --check-prefix=CHECK3 +RUN: printf "%%s\n" "a\\b\\\\c\\\\\\\\d" | FileCheck %s --check-prefix=CHECK3 CHECK1: {{^"$}} CHECK2: {{^a\[b\\c$}} CHECK3: {{^a\\b\\\\c\\\\\\\\d$}} On Windows, with MSYS based tools, the following commands fail though: -RUNX: echo 'a[b\c\\d' | FileCheck %s --check-prefix=CHECK4 -RUNX: echo "a[b\\c\\\\d" | FileCheck %s --check-prefix=CHECK4 +RUNX: printf "%%s\n" 'a[b\c\\d' | FileCheck %s --check-prefix=CHECK4 +RUNX: printf "%%s\n" "a[b\\c\\\\d" | FileCheck %s --check-prefix=CHECK4 CHECK4: {{^a\[b\\c\\\\d$}} From 73c4c3276720b20525ce9ef5f8e4f0c20fd93862 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 14 Aug 2021 16:34:52 -0700 Subject: [PATCH 044/700] [X86] Use __builtin_bit_cast _mm_extract_ps instead of type punning through a union. 
NFC --- clang/lib/Headers/smmintrin.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 3ee58c9d79370..c55e6dc65c49d 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -865,10 +865,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 10: Bits [95:64] of parameter \a X are returned. \n /// 11: Bits [127:96] of parameter \a X are returned. /// \returns A 32-bit integer containing the extracted 32 bits of float data. -#define _mm_extract_ps(X, N) (__extension__ \ - ({ union { int __i; float __f; } __t; \ - __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ - __t.__i;})) +#define _mm_extract_ps(X, N) \ + __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N))) /* Miscellaneous insert and extract macros. */ /* Extract a single-precision float from X at index N into D. */ From d2cb18918498b8a39657af2a495eba3e983c159b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 14 Aug 2021 16:41:50 -0700 Subject: [PATCH 045/700] [X86] Use a do {} while (0) in the _MM_EXTRACT_FLOAT implementation. Previously we just used {}, but that doesn't work in situations like this. if (1) _MM_EXTRACT_FLOAT(d, x, n); else ... The semicolon would terminate the if. --- clang/lib/Headers/smmintrin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index c55e6dc65c49d..8913a196144bb 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -871,7 +871,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /* Miscellaneous insert and extract macros. */ /* Extract a single-precision float from X at index N into D. 
*/ #define _MM_EXTRACT_FLOAT(D, X, N) \ - { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } + do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0) /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create an index suitable for _mm_insert_ps. */ From 1a0076db69c2df1a2c1e7452d7a90e2aa514300d Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 14 Aug 2021 16:26:06 -0700 Subject: [PATCH 046/700] [sanitizer] Fix format string --- .../sanitizer_common/sanitizer_allocator_size_class_map.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_size_class_map.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_size_class_map.h index c50d13303edec..361793f2490ac 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_size_class_map.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_size_class_map.h @@ -193,13 +193,13 @@ class SizeClassMap { uptr cached = MaxCachedHint(s) * s; if (i == kBatchClassID) d = p = l = 0; - Printf("c%02zd => s: %zd diff: +%zd %02zd%% l %zd " - "cached: %zd %zd; id %zd\n", - i, Size(i), d, p, l, MaxCachedHint(s), cached, ClassID(s)); + Printf( + "c%02zu => s: %zu diff: +%zu %02zu%% l %zu cached: %zu %zu; id %zu\n", + i, Size(i), d, p, l, MaxCachedHint(s), cached, ClassID(s)); total_cached += cached; prev_s = s; } - Printf("Total cached: %zd\n", total_cached); + Printf("Total cached: %zu\n", total_cached); } static void Validate() { From 45138f788c9b3c4ac5d9ae4479841c411c15190e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 14 Aug 2021 16:51:10 -0700 Subject: [PATCH 047/700] [sanitizer] Define 32bit uptr as uint This makes it consistent with uintptr_t. 
--- compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h | 5 +++++ .../lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index f7fdc160eeb1c..07b303e06a098 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,8 +139,13 @@ namespace __sanitizer { typedef unsigned long long uptr; typedef signed long long sptr; #else +# if (SANITIZER_WORDSIZE == 64) typedef unsigned long uptr; typedef signed long sptr; +# else +typedef unsigned int uptr; +typedef signed int sptr; +# endif #endif // defined(_WIN64) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp index 670e96552c68f..385b6158300ca 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp @@ -71,7 +71,7 @@ void Print(const set &s) { #if defined(_WIN64) fprintf(stderr, "%llu ", *it); #else - fprintf(stderr, "%lu ", *it); + fprintf(stderr, "%zu ", *it); #endif } fprintf(stderr, "\n"); From f1de9d6dae174feb5000ad6a1b492b8cb717f5b6 Mon Sep 17 00:00:00 2001 From: "Wang, Pengfei" Date: Sun, 15 Aug 2021 08:17:30 +0800 Subject: [PATCH 048/700] [X86] AVX512FP16 instructions enabling 2/6 Enable FP16 binary operator instructions. 
Ref.: https://software.intel.com/content/www/us/en/develop/download/intel-avx512-fp16-architecture-specification.html Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D105264 --- clang/include/clang/Basic/BuiltinsX86.def | 35 + clang/lib/CodeGen/CGBuiltin.cpp | 23 +- clang/lib/Headers/avx512fp16intrin.h | 549 ++++++++ clang/lib/Headers/avx512vlfp16intrin.h | 290 +++++ clang/lib/Sema/SemaChecking.cpp | 14 + clang/test/CodeGen/X86/avx512fp16-builtins.c | 1067 ++++++++++++++++ .../test/CodeGen/X86/avx512vlfp16-builtins.c | 1112 +++++++++++++++++ llvm/include/llvm/IR/IntrinsicsX86.td | 113 ++ .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 7 +- .../X86/MCTargetDesc/X86ATTInstPrinter.cpp | 16 + .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 18 + llvm/lib/Target/X86/X86ISelLowering.cpp | 24 + llvm/lib/Target/X86/X86InstrAVX512.td | 110 +- llvm/lib/Target/X86/X86InstrFoldTables.cpp | 107 ++ llvm/lib/Target/X86/X86InstrInfo.cpp | 45 + llvm/lib/Target/X86/X86IntrinsicsInfo.h | 28 + .../X86/avx512fp16-arith-intrinsics.ll | 284 +++++ .../X86/avx512fp16-arith-vl-intrinsics.ll | 404 ++++++ llvm/test/CodeGen/X86/avx512fp16-arith.ll | 355 ++++++ llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll | 129 ++ llvm/test/CodeGen/X86/avx512fp16-fminnum.ll | 129 ++ .../X86/avx512fp16-fold-load-binops.ll | 83 ++ .../CodeGen/X86/avx512fp16-fold-xmm-zero.ll | 34 + llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll | 381 ++++++ .../test/CodeGen/X86/avx512fp16-intrinsics.ll | 189 +++ .../X86/avx512fp16-machine-combiner.ll | 345 +++++ llvm/test/CodeGen/X86/avx512fp16-mov.ll | 41 + .../CodeGen/X86/avx512fp16-unsafe-fp-math.ll | 141 +++ .../CodeGen/X86/fp-strict-scalar-cmp-fp16.ll | 719 +++++++++++ .../test/CodeGen/X86/fp-strict-scalar-fp16.ll | 78 ++ .../CodeGen/X86/pseudo_cmov_lower-fp16.ll | 63 + .../X86/stack-folding-fp-avx512fp16.ll | 572 +++++++++ .../X86/stack-folding-fp-avx512fp16vl.ll | 148 +++ llvm/test/CodeGen/X86/vec-strict-128-fp16.ll | 54 + 
llvm/test/CodeGen/X86/vec-strict-256-fp16.ll | 54 + llvm/test/CodeGen/X86/vec-strict-512-fp16.ll | 54 + .../CodeGen/X86/vec-strict-cmp-128-fp16.ll | 1012 +++++++++++++++ .../CodeGen/X86/vec-strict-cmp-256-fp16.ll | 708 +++++++++++ .../CodeGen/X86/vec-strict-cmp-512-fp16.ll | 708 +++++++++++ .../CodeGen/X86/vector-reduce-fmax-nnan.ll | 43 +- .../CodeGen/X86/vector-reduce-fmin-nnan.ll | 43 +- llvm/test/MC/Disassembler/X86/avx512fp16.txt | 384 ++++++ .../test/MC/Disassembler/X86/avx512fp16vl.txt | 282 +++++ llvm/test/MC/X86/avx512fp16.s | 384 ++++++ llvm/test/MC/X86/avx512fp16vl.s | 281 +++++ llvm/test/MC/X86/intel-syntax-avx512fp16.s | 384 ++++++ llvm/test/MC/X86/intel-syntax-avx512fp16vl.s | 281 +++++ 47 files changed, 12282 insertions(+), 43 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-arith.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-fminnum.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll create mode 100644 llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll create mode 100644 llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll create mode 100644 llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll create mode 100644 llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll create mode 100644 llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll create mode 100644 llvm/test/CodeGen/X86/vec-strict-128-fp16.ll create mode 100644 llvm/test/CodeGen/X86/vec-strict-256-fp16.ll create mode 100644 llvm/test/CodeGen/X86/vec-strict-512-fp16.ll create mode 
100644 llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll create mode 100644 llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll create mode 100644 llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll create mode 100644 llvm/test/MC/Disassembler/X86/avx512fp16vl.txt create mode 100644 llvm/test/MC/X86/avx512fp16vl.s create mode 100644 llvm/test/MC/X86/intel-syntax-avx512fp16vl.s diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index c54133c449889..594415fe80692 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1850,6 +1850,29 @@ TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "a TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl") // AVX512 fp16 intrinsics +TARGET_BUILTIN(__builtin_ia32_vcomish, "iV8xV8xIiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_addph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_subph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_mulph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_divph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_maxph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_minph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_minph256, "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_minph128, "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_maxph256, "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_maxph128, "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl") + +TARGET_BUILTIN(__builtin_ia32_addsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_divsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") 
+TARGET_BUILTIN(__builtin_ia32_mulsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_subsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_maxsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_minsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_cmpph512_mask, "UiV32xV32xIiUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_cmpph256_mask, "UsV16xV16xIiUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cmpph128_mask, "UcV8xV8xIiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") @@ -1886,12 +1909,24 @@ TARGET_BUILTIN(__builtin_ia32_reduce_and_d512, "iV16i", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph512, "xxV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph512, "xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl") 
TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph512, "xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph512, "xxV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 46dbaf6ba88cd..536a0bae13afe 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14137,28 +14137,40 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {Ops[0]}); } case X86::BI__builtin_ia32_reduce_fadd_pd512: - case X86::BI__builtin_ia32_reduce_fadd_ps512: { + case X86::BI__builtin_ia32_reduce_fadd_ps512: + case X86::BI__builtin_ia32_reduce_fadd_ph512: + case X86::BI__builtin_ia32_reduce_fadd_ph256: + case X86::BI__builtin_ia32_reduce_fadd_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType()); Builder.getFastMathFlags().setAllowReassoc(); return Builder.CreateCall(F, {Ops[0], Ops[1]}); } case X86::BI__builtin_ia32_reduce_fmul_pd512: - case X86::BI__builtin_ia32_reduce_fmul_ps512: { 
+ case X86::BI__builtin_ia32_reduce_fmul_ps512: + case X86::BI__builtin_ia32_reduce_fmul_ph512: + case X86::BI__builtin_ia32_reduce_fmul_ph256: + case X86::BI__builtin_ia32_reduce_fmul_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType()); Builder.getFastMathFlags().setAllowReassoc(); return Builder.CreateCall(F, {Ops[0], Ops[1]}); } case X86::BI__builtin_ia32_reduce_fmax_pd512: - case X86::BI__builtin_ia32_reduce_fmax_ps512: { + case X86::BI__builtin_ia32_reduce_fmax_ps512: + case X86::BI__builtin_ia32_reduce_fmax_ph512: + case X86::BI__builtin_ia32_reduce_fmax_ph256: + case X86::BI__builtin_ia32_reduce_fmax_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType()); Builder.getFastMathFlags().setNoNaNs(); return Builder.CreateCall(F, {Ops[0]}); } case X86::BI__builtin_ia32_reduce_fmin_pd512: - case X86::BI__builtin_ia32_reduce_fmin_ps512: { + case X86::BI__builtin_ia32_reduce_fmin_ps512: + case X86::BI__builtin_ia32_reduce_fmin_ph512: + case X86::BI__builtin_ia32_reduce_fmin_ph256: + case X86::BI__builtin_ia32_reduce_fmin_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType()); Builder.getFastMathFlags().setNoNaNs(); @@ -14422,6 +14434,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cmpordps: case X86::BI__builtin_ia32_cmpordpd: return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false); + case X86::BI__builtin_ia32_cmpph128_mask: + case X86::BI__builtin_ia32_cmpph256_mask: + case X86::BI__builtin_ia32_cmpph512_mask: case X86::BI__builtin_ia32_cmpps128_mask: case X86::BI__builtin_ia32_cmpps256_mask: case X86::BI__builtin_ia32_cmpps512_mask: diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 926033e20152c..58d7349c4905a 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -269,10 +269,539 @@ _mm512_zextph256_ph512(__m256h __a) { 
29, 30, 31); } +#define _mm_comi_round_sh(A, B, P, R) \ + __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) + +#define _mm_comi_sh(A, B, pred) \ + _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, + __m128h B) { + return 
__builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A + (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_add_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_add_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_add_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_add_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A - (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), 
(__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_sub_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_sub_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_sub_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sub_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A * (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_mul_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_mul_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_mul_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_mul_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 
_mm512_div_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A / (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_div_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_div_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_div_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_div_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_min_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_min_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_min_round_ph(W, U, A, B, R) \ + 
((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_min_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_max_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_max_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_max_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_max_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); } +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, + __m128h __B) { + __A[0] += __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return 
__builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_add_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_add_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_add_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, + __m128h __B) { + __A[0] -= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_sub_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sub_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ 
__m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, + __m128h __B) { + __A[0] *= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_mul_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_mul_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_mul_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, + __m128h __B) { + __A[0] /= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_div_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_div_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_div_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_min_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_min_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_min_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return 
(__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_max_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_max_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_max_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_cmp_round_ph_mask(A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_cmp_ph_mask(A, B, P) \ + _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_cmp_ph_mask(U, A, B, P) \ + _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm_cmp_round_sh_mask(X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) + +#define _mm_cmp_sh_mask(X, Y, P) \ + 
((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sh_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) // loads with vmovsh: static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { struct __mm_load_sh_struct { @@ -418,6 +947,26 @@ static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { return __b[0]; } +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_ph(__m512h __W) { + return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_ph(__m512h __W) { + return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_ph(__m512h __V) { + return __builtin_ia32_reduce_fmax_ph512(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_ph(__m512h __V) { + return __builtin_ia32_reduce_fmin_ph512(__V); +} + static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index cda54bcc8351d..0f23054e6fa10 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -69,6 +69,240 @@ _mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ (h7), (h6), (h5), (h4), (h3), (h2), (h1)) +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A + (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 
+_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A + (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A - (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A - (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)__W); 
+} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A * (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A * (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A / (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, 
(__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A / (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + 
(__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); } @@ -77,6 +311,22 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); } +#define _mm256_cmp_ph_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), 
(int)(p), (__mmask16)-1)) + +#define _mm256_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) + +#define _mm_cmp_ph_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) + +#define _mm_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { @@ -112,6 +362,46 @@ _mm256_permutexvar_ph(__m256i __A, __m256h __B) { return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_add_ph(__m256h __W) { + return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_mul_ph(__m256h __W) { + return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_max_ph(__m256h __V) { + return __builtin_ia32_reduce_fmax_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_min_ph(__m256h __V) { + return __builtin_ia32_reduce_fmin_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_add_ph(__m128h __W) { + return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_mul_ph(__m128h __W) { + return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_max_ph(__m128h __V) { + return __builtin_ia32_reduce_fmax_ph128(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_min_ph(__m128h __V) { + return __builtin_ia32_reduce_fmin_ph128(__V); +} + #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git 
a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index e5008330a4150..063fd38f97c46 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3884,6 +3884,8 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_maxps512: case X86::BI__builtin_ia32_minpd512: case X86::BI__builtin_ia32_minps512: + case X86::BI__builtin_ia32_maxph512: + case X86::BI__builtin_ia32_minph512: ArgNum = 2; break; case X86::BI__builtin_ia32_cvtps2pd512_mask: @@ -3905,6 +3907,7 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_rsqrt28ps_mask: case X86::BI__builtin_ia32_vcomisd: case X86::BI__builtin_ia32_vcomiss: + case X86::BI__builtin_ia32_vcomish: case X86::BI__builtin_ia32_vcvtph2ps512_mask: ArgNum = 3; break; @@ -3912,6 +3915,7 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_cmpps512_mask: case X86::BI__builtin_ia32_cmpsd_mask: case X86::BI__builtin_ia32_cmpss_mask: + case X86::BI__builtin_ia32_cmpsh_mask: case X86::BI__builtin_ia32_cvtss2sd_round_mask: case X86::BI__builtin_ia32_getexpsd128_round_mask: case X86::BI__builtin_ia32_getexpss128_round_mask: @@ -3919,8 +3923,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_getmantps512_mask: case X86::BI__builtin_ia32_maxsd_round_mask: case X86::BI__builtin_ia32_maxss_round_mask: + case X86::BI__builtin_ia32_maxsh_round_mask: case X86::BI__builtin_ia32_minsd_round_mask: case X86::BI__builtin_ia32_minss_round_mask: + case X86::BI__builtin_ia32_minsh_round_mask: case X86::BI__builtin_ia32_rcp28sd_round_mask: case X86::BI__builtin_ia32_rcp28ss_round_mask: case X86::BI__builtin_ia32_reducepd512_mask: @@ -3964,6 +3970,10 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { ArgNum = 1; HasRC = true; break; + case 
X86::BI__builtin_ia32_addph512: + case X86::BI__builtin_ia32_divph512: + case X86::BI__builtin_ia32_mulph512: + case X86::BI__builtin_ia32_subph512: case X86::BI__builtin_ia32_addpd512: case X86::BI__builtin_ia32_addps512: case X86::BI__builtin_ia32_divpd512: @@ -3999,12 +4009,16 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { ArgNum = 3; HasRC = true; break; + case X86::BI__builtin_ia32_addsh_round_mask: case X86::BI__builtin_ia32_addss_round_mask: case X86::BI__builtin_ia32_addsd_round_mask: + case X86::BI__builtin_ia32_divsh_round_mask: case X86::BI__builtin_ia32_divss_round_mask: case X86::BI__builtin_ia32_divsd_round_mask: + case X86::BI__builtin_ia32_mulsh_round_mask: case X86::BI__builtin_ia32_mulss_round_mask: case X86::BI__builtin_ia32_mulsd_round_mask: + case X86::BI__builtin_ia32_subsh_round_mask: case X86::BI__builtin_ia32_subss_round_mask: case X86::BI__builtin_ia32_subsd_round_mask: case X86::BI__builtin_ia32_scalefpd512_mask: diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c index f8a989afa871a..4f627daff7e6c 100644 --- a/clang/test/CodeGen/X86/avx512fp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -337,12 +337,1055 @@ __m512h test_mm512_zextph256_ph512(__m256h __a) { return _mm512_zextph256_ph512(__a); } +int test_mm_comi_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comi_round_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 8) + return _mm_comi_round_sh(__A, __B, 0, _MM_FROUND_NO_EXC); +} + +int test_mm_comi_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comi_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 4) + return _mm_comi_sh(__A, __B, 0); +} + +int test_mm_comieq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comieq_sh + // CHECK: %{{.}} = call i32 
@llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 16, i32 4) + return _mm_comieq_sh(__A, __B); +} + +int test_mm_comilt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comilt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 1, i32 4) + return _mm_comilt_sh(__A, __B); +} + +int test_mm_comile_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comile_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 2, i32 4) + return _mm_comile_sh(__A, __B); +} + +int test_mm_comigt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comigt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 14, i32 4) + return _mm_comigt_sh(__A, __B); +} + +int test_mm_comige_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comige_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 13, i32 4) + return _mm_comige_sh(__A, __B); +} + +int test_mm_comineq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_comineq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 20, i32 4) + return _mm_comineq_sh(__A, __B); +} + +int test_mm_ucomieq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomieq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 4) + return _mm_ucomieq_sh(__A, __B); +} + +int test_mm_ucomilt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomilt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 17, i32 4) + return _mm_ucomilt_sh(__A, __B); +} + +int test_mm_ucomile_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomile_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 18, i32 
4) + return _mm_ucomile_sh(__A, __B); +} + +int test_mm_ucomigt_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomigt_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 30, i32 4) + return _mm_ucomigt_sh(__A, __B); +} + +int test_mm_ucomige_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomige_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 29, i32 4) + return _mm_ucomige_sh(__A, __B); +} + +int test_mm_ucomineq_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: test_mm_ucomineq_sh + // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 4, i32 4) + return _mm_ucomineq_sh(__A, __B); +} + +__m512h test_mm512_add_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_add_ph + // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}} + return _mm512_add_ph(__A, __B); +} + +__m512h test_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_add_ph + // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_add_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_add_ph + // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_add_ph(__U, __A, __B); +} + +__m512h test_mm512_add_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_add_round_ph + // CHECK: @llvm.x86.avx512fp16.add.ph.512 + return _mm512_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_add_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_add_round_ph + // CHECK: 
@llvm.x86.avx512fp16.add.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_add_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_add_round_ph + // CHECK: @llvm.x86.avx512fp16.add.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_sub_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_sub_ph + // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}} + return _mm512_sub_ph(__A, __B); +} + +__m512h test_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_sub_ph + // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_sub_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_sub_ph + // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_sub_ph(__U, __A, __B); +} + +__m512h test_mm512_sub_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_sub_round_ph + // CHECK: @llvm.x86.avx512fp16.sub.ph.512 + return _mm512_sub_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_sub_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_sub_round_ph + // CHECK: @llvm.x86.avx512fp16.sub.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_sub_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h 
test_mm512_maskz_sub_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_sub_round_ph + // CHECK: @llvm.x86.avx512fp16.sub.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_sub_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mul_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mul_ph + // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}} + return _mm512_mul_ph(__A, __B); +} + +__m512h test_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_mul_ph + // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_mul_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_mul_ph + // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_mul_ph(__U, __A, __B); +} + +__m512h test_mm512_mul_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mul_round_ph + // CHECK: @llvm.x86.avx512fp16.mul.ph.512 + return _mm512_mul_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_mul_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_mul_round_ph + // CHECK: @llvm.x86.avx512fp16.mul.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_mul_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_mul_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_mul_round_ph + // CHECK: @llvm.x86.avx512fp16.mul.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> 
%{{.*}} + return _mm512_maskz_mul_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_div_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_div_ph + // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}} + return _mm512_div_ph(__A, __B); +} + +__m512h test_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_div_ph + // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_div_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_div_ph + // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_div_ph(__U, __A, __B); +} + +__m512h test_mm512_div_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_div_round_ph + // CHECK: @llvm.x86.avx512fp16.div.ph.512 + return _mm512_div_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_div_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_div_round_ph + // CHECK: @llvm.x86.avx512fp16.div.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_div_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_div_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_div_round_ph + // CHECK: @llvm.x86.avx512fp16.div.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_div_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_min_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_min_ph + // CHECK: 
@llvm.x86.avx512fp16.min.ph.512 + return _mm512_min_ph(__A, __B); +} + +__m512h test_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return (__m512h)_mm512_mask_min_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_min_ph(__U, __A, __B); +} + +__m512h test_mm512_min_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_min_round_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + return _mm512_min_round_ph(__A, __B, _MM_FROUND_NO_EXC); +} +__m512h test_mm512_mask_min_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_min_round_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_min_round_ph(__W, __U, __A, __B, _MM_FROUND_NO_EXC); +} +__m512h test_mm512_maskz_min_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_min_round_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_min_round_ph(__U, __A, __B, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_max_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + + return _mm512_max_ph(__A, __B); +} + +__m512h test_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return 
(__m512h)_mm512_mask_max_ph(__W, __U, __A, __B); +} + +__m512h test_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_max_ph(__U, __A, __B); +} + +__m512h test_mm512_max_round_ph(__m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_max_round_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + return _mm512_max_round_ph(__A, __B, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_mask_max_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_mask_max_round_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_max_round_ph(__W, __U, __A, __B, _MM_FROUND_NO_EXC); +} + +__m512h test_mm512_maskz_max_round_ph(__mmask32 __U, __m512h __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_maskz_max_round_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.512 + // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_maskz_max_round_ph(__U, __A, __B, _MM_FROUND_NO_EXC); +} + __m512h test_mm512_abs_ph(__m512h a) { // CHECK-LABEL: @test_mm512_abs_ph // CHECK: and <16 x i32> return _mm512_abs_ph(a); } +__m128h test_mm_add_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_add_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round + return _mm_add_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_add_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_add_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round + return _mm_mask_add_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_add_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_add_round_sh + // 
CHECK: @llvm.x86.avx512fp16.mask.add.sh.round + return _mm_maskz_add_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_add_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_add_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_add_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_add_sh(__U, __A, __B); +} + +__m128h test_mm_add_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_add_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 
+ // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_add_sh(__A, __B); +} + +__m128h test_mm_sub_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sub_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round + return _mm_sub_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_sub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sub_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round + return _mm_mask_sub_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_sub_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sub_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round + return _mm_maskz_sub_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sub_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_sub_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sub_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> 
%{{.*}}, i32 0 + // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_sub_sh(__U, __A, __B); +} + +__m128h test_mm_sub_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sub_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_sub_sh(__A, __B); +} + +__m128h test_mm_mul_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mul_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round + return _mm_mul_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_mul_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_mul_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round + return _mm_mask_mul_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_mul_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_mul_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round + return _mm_maskz_mul_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_mul_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // 
CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_mul_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_mul_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_mul_sh(__U, __A, __B); +} + +__m128h test_mm_mul_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mul_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_mul_sh(__A, __B); +} + +__m128h test_mm_div_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_div_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round + return _mm_div_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} 
+__m128h test_mm_mask_div_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_div_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round + return _mm_mask_div_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_maskz_div_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_div_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round + return _mm_maskz_div_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO); +} +__m128h test_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_div_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_mask_div_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_div_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0 + 
// CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}} + // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0 + return _mm_maskz_div_sh(__U, __A, __B); +} + +__m128h test_mm_div_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_div_sh + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}} + // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_div_sh(__A, __B); +} + +__m128h test_mm_min_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_min_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_min_round_sh(__A, __B, 0x08); +} +__m128h test_mm_mask_min_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_min_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_mask_min_round_sh(__W, __U, __A, __B, 0x08); +} +__m128h test_mm_maskz_min_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_min_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_maskz_min_round_sh(__U, __A, __B, 0x08); +} +__m128h test_mm_mask_min_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_min_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_mask_min_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_min_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_min_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_maskz_min_sh(__U, __A, __B); +} + +__m128h test_mm_min_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_min_sh + // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round + return _mm_min_sh(__A, __B); +} + +__m128h test_mm_max_round_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_max_round_sh + // CHECK: 
@llvm.x86.avx512fp16.mask.max.sh.round + return _mm_max_round_sh(__A, __B, 0x08); +} +__m128h test_mm_mask_max_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_max_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_mask_max_round_sh(__W, __U, __A, __B, 0x08); +} +__m128h test_mm_maskz_max_round_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_max_round_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_maskz_max_round_sh(__U, __A, __B, 0x08); +} +__m128h test_mm_mask_max_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_max_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_mask_max_sh(__W, __U, __A, __B); +} +__m128h test_mm_maskz_max_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_max_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_maskz_max_sh(__U, __A, __B); +} + +__m128h test_mm_max_sh(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_max_sh + // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round + return _mm_max_sh(__A, __B); +} +__mmask32 test_mm512_cmp_round_ph_mask(__m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_cmp_round_ph_mask + // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_round_ph_mask(a, b, 0, _MM_FROUND_NO_EXC); +} + +__mmask32 test_mm512_mask_cmp_round_ph_mask(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_mask_cmp_round_ph_mask + // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_round_ph_mask(m, a, b, 0, _MM_FROUND_NO_EXC); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_oq(__m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_cmp_ph_mask_eq_oq + // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_lt_os(__m512h a, __m512h b) { + // 
CHECK-LABEL: test_mm512_cmp_ph_mask_lt_os + // CHECK: fcmp olt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_le_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_le_os + // CHECK: fcmp ole <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_unord_q(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_unord_q + // CHECK: fcmp uno <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_uq + // CHECK: fcmp une <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_nlt_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nlt_us + // CHECK: fcmp uge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_cmp_ph_mask_nle_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nle_us + // CHECK: fcmp ugt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_cmp_ph_mask_ord_q(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ord_q + // CHECK: fcmp ord <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_uq + // CHECK: fcmp ueq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_nge_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nge_us + // CHECK: fcmp ult <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_cmp_ph_mask_ngt_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ngt_us + // CHECK: fcmp ule 
<32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGT_US); +} + +__mmask32 test_mm512_cmp_ph_mask_false_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_false_oq + // CHECK: fcmp false <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_oq + // CHECK: fcmp one <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_ge_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ge_os + // CHECK: fcmp oge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_gt_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_gt_os + // CHECK: fcmp ogt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_true_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_true_uq + // CHECK: fcmp true <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_os + // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_lt_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_lt_oq + // CHECK: fcmp olt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_le_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_le_oq + // CHECK: fcmp ole <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_unord_s(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_unord_s + // CHECK: fcmp uno <32 x half> %{{.*}}, %{{.*}} + return 
_mm512_cmp_ph_mask(a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_us + // CHECK: fcmp une <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_cmp_ph_mask_nlt_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nlt_uq + // CHECK: fcmp uge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_nle_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nle_uq + // CHECK: fcmp ugt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_ord_s(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ord_s + // CHECK: fcmp ord <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_cmp_ph_mask_eq_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_us + // CHECK: fcmp ueq <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_cmp_ph_mask_nge_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_nge_uq + // CHECK: fcmp ult <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_ngt_uq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ngt_uq + // CHECK: fcmp ule <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_cmp_ph_mask_false_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_false_os + // CHECK: fcmp false <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_cmp_ph_mask_neq_os(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_os + // CHECK: fcmp one <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_OS); +} + 
+__mmask32 test_mm512_cmp_ph_mask_ge_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_ge_oq + // CHECK: fcmp oge <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_gt_oq(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_gt_oq + // CHECK: fcmp ogt <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_cmp_ph_mask_true_us(__m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_cmp_ph_mask_true_us + // CHECK: fcmp true <32 x half> %{{.*}}, %{{.*}} + return _mm512_cmp_ph_mask(a, b, _CMP_TRUE_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: @test_mm512_mask_cmp_ph_mask_eq_oq + // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_lt_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_lt_os + // CHECK: [[CMP:%.*]] = fcmp olt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_le_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_le_os + // CHECK: [[CMP:%.*]] = fcmp ole <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_unord_q(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_unord_q + // CHECK: [[CMP:%.*]] = fcmp uno <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_uq + // 
CHECK: [[CMP:%.*]] = fcmp une <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nlt_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nlt_us + // CHECK: [[CMP:%.*]] = fcmp uge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nle_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nle_us + // CHECK: [[CMP:%.*]] = fcmp ugt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ord_q(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ord_q + // CHECK: [[CMP:%.*]] = fcmp ord <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_uq + // CHECK: [[CMP:%.*]] = fcmp ueq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nge_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nge_us + // CHECK: [[CMP:%.*]] = fcmp ult <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ngt_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ngt_us + // CHECK: [[CMP:%.*]] = fcmp ule <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US); +} + +__mmask32 
test_mm512_mask_cmp_ph_mask_false_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_false_oq + // CHECK: [[CMP:%.*]] = fcmp false <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_oq + // CHECK: [[CMP:%.*]] = fcmp one <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ge_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ge_os + // CHECK: [[CMP:%.*]] = fcmp oge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_gt_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_gt_os + // CHECK: [[CMP:%.*]] = fcmp ogt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_true_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_true_uq + // CHECK: [[CMP:%.*]] = fcmp true <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_os + // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_lt_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_lt_oq + // CHECK: [[CMP:%.*]] = fcmp olt <32 x half> %{{.*}}, 
%{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_le_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_le_oq + // CHECK: [[CMP:%.*]] = fcmp ole <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_unord_s(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_unord_s + // CHECK: [[CMP:%.*]] = fcmp uno <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_us + // CHECK: [[CMP:%.*]] = fcmp une <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nlt_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nlt_uq + // CHECK: [[CMP:%.*]] = fcmp uge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nle_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nle_uq + // CHECK: [[CMP:%.*]] = fcmp ugt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ord_s(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ord_s + // CHECK: [[CMP:%.*]] = fcmp ord <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_eq_us(__mmask32 m, __m512h a, 
__m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_us + // CHECK: [[CMP:%.*]] = fcmp ueq <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_nge_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nge_uq + // CHECK: [[CMP:%.*]] = fcmp ult <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ngt_uq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ngt_uq + // CHECK: [[CMP:%.*]] = fcmp ule <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_false_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_false_os + // CHECK: [[CMP:%.*]] = fcmp false <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_neq_os(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_os + // CHECK: [[CMP:%.*]] = fcmp one <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_ge_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ge_oq + // CHECK: [[CMP:%.*]] = fcmp oge <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_gt_oq(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_gt_oq + // CHECK: [[CMP:%.*]] = fcmp ogt <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return 
_mm512_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask32 test_mm512_mask_cmp_ph_mask_true_us(__mmask32 m, __m512h a, __m512h b) { + // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_true_us + // CHECK: [[CMP:%.*]] = fcmp true <32 x half> %{{.*}}, %{{.*}} + // CHECK: and <32 x i1> [[CMP]], {{.*}} + return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_cmp_round_sh_mask(__m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_cmp_round_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_cmp_round_sh_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC); +} + +__mmask8 test_mm_mask_cmp_round_sh_mask(__mmask8 __M, __m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_mask_cmp_round_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_mask_cmp_round_sh_mask(__M, __X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC); +} + +__mmask8 test_mm_cmp_sh_mask(__m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_cmp_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_cmp_sh_mask(__X, __Y, _CMP_NLT_US); +} + +__mmask8 test_mm_mask_cmp_sh_mask(__mmask8 __M, __m128h __X, __m128h __Y) { + // CHECK-LABEL: @test_mm_mask_cmp_sh_mask + // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh + return _mm_mask_cmp_sh_mask(__M, __X, __Y, _CMP_NLT_US); +} + // VMOVSH __m128h test_mm_load_sh(void const *A) { @@ -499,6 +1542,30 @@ __m128i test_mm_cvtsi16_si128(short A) { return _mm_cvtsi16_si128(A); } +_Float16 test_mm512_reduce_add_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_add_ph + // CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}}) + return _mm512_reduce_add_ph(__W); +} + +_Float16 test_mm512_reduce_mul_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_mul_ph + // CHECK: call reassoc half @llvm.vector.reduce.fmul.v32f16(half 0xH3C00, <32 x half> %{{.*}}) + return _mm512_reduce_mul_ph(__W); +} + +_Float16 test_mm512_reduce_max_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_max_ph + // CHECK: 
call nnan half @llvm.vector.reduce.fmax.v32f16(<32 x half> %{{.*}}) + return _mm512_reduce_max_ph(__W); +} + +_Float16 test_mm512_reduce_min_ph(__m512h __W) { + // CHECK-LABEL: @test_mm512_reduce_min_ph + // CHECK: call nnan half @llvm.vector.reduce.fmin.v32f16(<32 x half> %{{.*}}) + return _mm512_reduce_min_ph(__W); +} + __m512h test_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { // CHECK-LABEL: @test_mm512_mask_blend_ph // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c index 187f81e348b84..a4e3b1e2be941 100644 --- a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -139,6 +139,238 @@ __m256h test_mm256_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16); } +__m256h test_mm256_add_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_add_ph + // CHECK: %{{.*}} = fadd <16 x half> %{{.*}}, %{{.*}} + return _mm256_add_ph(__A, __B); +} + +__m256h test_mm256_mask_add_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_add_ph + // CHECK: %{{.*}} = fadd <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_add_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_add_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_add_ph + // CHECK: %{{.*}} = fadd <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_add_ph(__U, __A, __B); +} + +__m128h test_mm_add_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_add_ph + // CHECK: %{{.*}} = fadd <8 x half> %{{.*}}, %{{.*}} + return _mm_add_ph(__A, __B); +} + +__m128h test_mm_mask_add_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // 
CHECK-LABEL: @test_mm_mask_add_ph + // CHECK: %{{.*}} = fadd <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_add_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_add_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_add_ph + // CHECK: %{{.*}} = fadd <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_add_ph(__U, __A, __B); +} + +__m256h test_mm256_sub_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_sub_ph + // CHECK: %{{.*}} = fsub <16 x half> %{{.*}}, %{{.*}} + return _mm256_sub_ph(__A, __B); +} + +__m256h test_mm256_mask_sub_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_sub_ph + // CHECK: %{{.*}} = fsub <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_sub_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_sub_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_sub_ph + // CHECK: %{{.*}} = fsub <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_sub_ph(__U, __A, __B); +} + +__m128h test_mm_sub_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_sub_ph + // CHECK: %{{.*}} = fsub <8 x half> %{{.*}}, %{{.*}} + return _mm_sub_ph(__A, __B); +} + +__m128h test_mm_mask_sub_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_sub_ph + // CHECK: %{{.*}} = fsub <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_sub_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_sub_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_sub_ph + // CHECK: %{{.*}} = fsub <8 x half> %{{.*}}, %{{.*}} + // 
CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_sub_ph(__U, __A, __B); +} + +__m256h test_mm256_mul_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mul_ph + // CHECK: %{{.*}} = fmul <16 x half> %{{.*}}, %{{.*}} + return _mm256_mul_ph(__A, __B); +} + +__m256h test_mm256_mask_mul_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_mul_ph + // CHECK: %{{.*}} = fmul <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_mul_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_mul_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_mul_ph + // CHECK: %{{.*}} = fmul <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_mul_ph(__U, __A, __B); +} + +__m128h test_mm_mul_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mul_ph + // CHECK: %{{.*}} = fmul <8 x half> %{{.*}}, %{{.*}} + return _mm_mul_ph(__A, __B); +} + +__m128h test_mm_mask_mul_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_mul_ph + // CHECK: %{{.*}} = fmul <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_mul_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_mul_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_mul_ph + // CHECK: %{{.*}} = fmul <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_mul_ph(__U, __A, __B); +} + +__m256h test_mm256_div_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_div_ph + // CHECK: %{{.*}} = fdiv <16 x half> %{{.*}}, %{{.*}} + return _mm256_div_ph(__A, __B); +} + +__m256h test_mm256_mask_div_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // 
CHECK-LABEL: @test_mm256_mask_div_ph + // CHECK: %{{.*}} = fdiv <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return (__m256h)_mm256_mask_div_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_div_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_div_ph + // CHECK: %{{.*}} = fdiv <16 x half> %{{.*}}, %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_maskz_div_ph(__U, __A, __B); +} + +__m128h test_mm_div_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_div_ph + // CHECK: %{{.*}} = fdiv <8 x half> %{{.*}}, %{{.*}} + return _mm_div_ph(__A, __B); +} + +__m128h test_mm_mask_div_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_div_ph + // CHECK: %{{.*}} = fdiv <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return (__m128h)_mm_mask_div_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_div_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_div_ph + // CHECK: %{{.*}} = fdiv <8 x half> %{{.*}}, %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_maskz_div_ph(__U, __A, __B); +} + +__m256h test_mm256_min_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.256 + return _mm256_min_ph(__A, __B); +} + +__m256h test_mm256_mask_min_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.256 + return (__m256h)_mm256_mask_min_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_min_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.256 + return _mm256_maskz_min_ph(__U, __A, __B); +} + +__m128h test_mm_min_ph(__m128h __A, __m128h __B) { 
+ // CHECK-LABEL: @test_mm_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.128 + return _mm_min_ph(__A, __B); +} + +__m128h test_mm_mask_min_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.128 + return (__m128h)_mm_mask_min_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_min_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_min_ph + // CHECK: @llvm.x86.avx512fp16.min.ph.128 + return _mm_maskz_min_ph(__U, __A, __B); +} + +__m256h test_mm256_max_ph(__m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.256 + return _mm256_max_ph(__A, __B); +} + +__m256h test_mm256_mask_max_ph(__m256h __W, __mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_mask_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.256 + return (__m256h)_mm256_mask_max_ph(__W, __U, __A, __B); +} + +__m256h test_mm256_maskz_max_ph(__mmask32 __U, __m256h __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_maskz_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.256 + return _mm256_maskz_max_ph(__U, __A, __B); +} + +__m128h test_mm_max_ph(__m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.128 + return _mm_max_ph(__A, __B); +} + +__m128h test_mm_mask_max_ph(__m128h __W, __mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.128 + return (__m128h)_mm_mask_max_ph(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_max_ph(__mmask32 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_max_ph + // CHECK: @llvm.x86.avx512fp16.max.ph.128 + return _mm_maskz_max_ph(__U, __A, __B); +} + __m128h test_mm_abs_ph(__m128h a) { // CHECK-LABEL: @test_mm_abs_ph // CHECK: and <4 x i32> @@ -151,6 +383,838 @@ __m256h test_mm256_abs_ph(__m256h a) { return _mm256_abs_ph(a); } +__mmask16 test_mm256_cmp_ph_mask_eq_oq(__m256h a, 
__m256h b) { + // CHECK-LABEL: @test_mm256_cmp_ph_mask_eq_oq + // CHECK: fcmp oeq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_lt_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_lt_os + // CHECK: fcmp olt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_le_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_le_os + // CHECK: fcmp ole <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_unord_q(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_unord_q + // CHECK: fcmp uno <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_uq + // CHECK: fcmp une <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_nlt_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nlt_us + // CHECK: fcmp uge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_cmp_ph_mask_nle_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nle_us + // CHECK: fcmp ugt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_cmp_ph_mask_ord_q(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ord_q + // CHECK: fcmp ord <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_cmp_ph_mask_eq_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_eq_uq + // CHECK: fcmp ueq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_nge_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nge_us + // 
CHECK: fcmp ult <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_cmp_ph_mask_ngt_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ngt_us + // CHECK: fcmp ule <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_cmp_ph_mask_false_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_false_oq + // CHECK: fcmp false <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_oq + // CHECK: fcmp one <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_ge_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ge_os + // CHECK: fcmp oge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_gt_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_gt_os + // CHECK: fcmp ogt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_true_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_true_uq + // CHECK: fcmp true <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_eq_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_eq_os + // CHECK: fcmp oeq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_lt_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_lt_oq + // CHECK: fcmp olt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_le_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_le_oq + // CHECK: fcmp ole <16 x half> %{{.*}}, %{{.*}} + return 
_mm256_cmp_ph_mask(a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_unord_s(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_unord_s + // CHECK: fcmp uno <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_cmp_ph_mask_neq_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_us + // CHECK: fcmp une <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_cmp_ph_mask_nlt_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nlt_uq + // CHECK: fcmp uge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_nle_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nle_uq + // CHECK: fcmp ugt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_ord_s(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ord_s + // CHECK: fcmp ord <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_cmp_ph_mask_eq_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_eq_us + // CHECK: fcmp ueq <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_EQ_US); +} + +__mmask16 test_mm256_cmp_ph_mask_nge_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_nge_uq + // CHECK: fcmp ult <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_ngt_uq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ngt_uq + // CHECK: fcmp ule <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_cmp_ph_mask_false_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_false_os + // CHECK: fcmp false <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_FALSE_OS); +} + 
+__mmask16 test_mm256_cmp_ph_mask_neq_os(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_neq_os + // CHECK: fcmp one <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_cmp_ph_mask_ge_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_ge_oq + // CHECK: fcmp oge <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GE_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_gt_oq(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_gt_oq + // CHECK: fcmp ogt <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_cmp_ph_mask_true_us(__m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_cmp_ph_mask_true_us + // CHECK: fcmp true <16 x half> %{{.*}}, %{{.*}} + return _mm256_cmp_ph_mask(a, b, _CMP_TRUE_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: @test_mm256_mask_cmp_ph_mask_eq_oq + // CHECK: [[CMP:%.*]] = fcmp oeq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_lt_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_lt_os + // CHECK: [[CMP:%.*]] = fcmp olt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_le_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_le_os + // CHECK: [[CMP:%.*]] = fcmp ole <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_unord_q(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_unord_q + // CHECK: [[CMP:%.*]] = fcmp uno <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], 
{{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_uq + // CHECK: [[CMP:%.*]] = fcmp une <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nlt_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nlt_us + // CHECK: [[CMP:%.*]] = fcmp uge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nle_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nle_us + // CHECK: [[CMP:%.*]] = fcmp ugt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ord_q(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ord_q + // CHECK: [[CMP:%.*]] = fcmp ord <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_eq_uq + // CHECK: [[CMP:%.*]] = fcmp ueq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nge_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nge_us + // CHECK: [[CMP:%.*]] = fcmp ult <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ngt_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: 
test_mm256_mask_cmp_ph_mask_ngt_us + // CHECK: [[CMP:%.*]] = fcmp ule <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_false_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_false_oq + // CHECK: [[CMP:%.*]] = fcmp false <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_oq + // CHECK: [[CMP:%.*]] = fcmp one <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ge_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ge_os + // CHECK: [[CMP:%.*]] = fcmp oge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_gt_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_gt_os + // CHECK: [[CMP:%.*]] = fcmp ogt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_true_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_true_uq + // CHECK: [[CMP:%.*]] = fcmp true <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_eq_os + // CHECK: [[CMP:%.*]] = fcmp oeq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, 
a, b, _CMP_EQ_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_lt_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_lt_oq + // CHECK: [[CMP:%.*]] = fcmp olt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_le_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_le_oq + // CHECK: [[CMP:%.*]] = fcmp ole <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_unord_s(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_unord_s + // CHECK: [[CMP:%.*]] = fcmp uno <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_us + // CHECK: [[CMP:%.*]] = fcmp une <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nlt_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nlt_uq + // CHECK: [[CMP:%.*]] = fcmp uge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nle_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nle_uq + // CHECK: [[CMP:%.*]] = fcmp ugt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ord_s(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ord_s + // CHECK: [[CMP:%.*]] = 
fcmp ord <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_eq_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_eq_us + // CHECK: [[CMP:%.*]] = fcmp ueq <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_nge_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_nge_uq + // CHECK: [[CMP:%.*]] = fcmp ult <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ngt_uq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ngt_uq + // CHECK: [[CMP:%.*]] = fcmp ule <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_false_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_false_os + // CHECK: [[CMP:%.*]] = fcmp false <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_neq_os(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_neq_os + // CHECK: [[CMP:%.*]] = fcmp one <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_ge_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_ge_oq + // CHECK: [[CMP:%.*]] = fcmp oge <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask16 
test_mm256_mask_cmp_ph_mask_gt_oq(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_gt_oq + // CHECK: [[CMP:%.*]] = fcmp ogt <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask16 test_mm256_mask_cmp_ph_mask_true_us(__mmask16 m, __m256h a, __m256h b) { + // CHECK-LABEL: test_mm256_mask_cmp_ph_mask_true_us + // CHECK: [[CMP:%.*]] = fcmp true <16 x half> %{{.*}}, %{{.*}} + // CHECK: and <16 x i1> [[CMP]], {{.*}} + return _mm256_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_cmp_ph_mask_eq_oq(__m128h a, __m128h b) { + // CHECK-LABEL: @test_mm_cmp_ph_mask_eq_oq + // CHECK: fcmp oeq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_lt_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_lt_os + // CHECK: fcmp olt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_cmp_ph_mask_le_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_le_os + // CHECK: fcmp ole <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_cmp_ph_mask_unord_q(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_unord_q + // CHECK: fcmp uno <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_cmp_ph_mask_neq_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_uq + // CHECK: fcmp une <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_nlt_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nlt_us + // CHECK: fcmp uge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_cmp_ph_mask_nle_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nle_us + // CHECK: fcmp ugt <8 x half> %{{.*}}, %{{.*}} + return 
_mm_cmp_ph_mask(a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_cmp_ph_mask_ord_q(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ord_q + // CHECK: fcmp ord <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_cmp_ph_mask_eq_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_eq_uq + // CHECK: fcmp ueq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_nge_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nge_us + // CHECK: fcmp ult <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_cmp_ph_mask_ngt_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ngt_us + // CHECK: fcmp ule <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_cmp_ph_mask_false_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_false_oq + // CHECK: fcmp false <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_neq_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_oq + // CHECK: fcmp one <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_ge_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ge_os + // CHECK: fcmp oge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_cmp_ph_mask_gt_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_gt_os + // CHECK: fcmp ogt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GT_OS); +} + +__mmask8 test_mm_cmp_ph_mask_true_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_true_uq + // CHECK: fcmp true <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_eq_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_eq_os + // 
CHECK: fcmp oeq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_cmp_ph_mask_lt_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_lt_oq + // CHECK: fcmp olt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_le_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_le_oq + // CHECK: fcmp ole <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_unord_s(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_unord_s + // CHECK: fcmp uno <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_cmp_ph_mask_neq_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_us + // CHECK: fcmp une <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_cmp_ph_mask_nlt_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nlt_uq + // CHECK: fcmp uge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_nle_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nle_uq + // CHECK: fcmp ugt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_ord_s(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ord_s + // CHECK: fcmp ord <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_cmp_ph_mask_eq_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_eq_us + // CHECK: fcmp ueq <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_cmp_ph_mask_nge_uq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_nge_uq + // CHECK: fcmp ult <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_ngt_uq(__m128h a, __m128h b) { + // 
CHECK-LABEL: test_mm_cmp_ph_mask_ngt_uq + // CHECK: fcmp ule <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_cmp_ph_mask_false_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_false_os + // CHECK: fcmp false <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_cmp_ph_mask_neq_os(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_neq_os + // CHECK: fcmp one <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_cmp_ph_mask_ge_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_ge_oq + // CHECK: fcmp oge <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_gt_oq(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_gt_oq + // CHECK: fcmp ogt <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_cmp_ph_mask_true_us(__m128h a, __m128h b) { + // CHECK-LABEL: test_mm_cmp_ph_mask_true_us + // CHECK: fcmp true <8 x half> %{{.*}}, %{{.*}} + return _mm_cmp_ph_mask(a, b, _CMP_TRUE_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: @test_mm_mask_cmp_ph_mask_eq_oq + // CHECK: [[CMP:%.*]] = fcmp oeq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_lt_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_lt_os + // CHECK: [[CMP:%.*]] = fcmp olt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_le_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_le_os + // CHECK: [[CMP:%.*]] = fcmp ole <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return 
_mm_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_unord_q(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_unord_q + // CHECK: [[CMP:%.*]] = fcmp uno <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_uq + // CHECK: [[CMP:%.*]] = fcmp une <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nlt_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nlt_us + // CHECK: [[CMP:%.*]] = fcmp uge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nle_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nle_us + // CHECK: [[CMP:%.*]] = fcmp ugt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ord_q(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ord_q + // CHECK: [[CMP:%.*]] = fcmp ord <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_eq_uq + // CHECK: [[CMP:%.*]] = fcmp ueq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nge_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nge_us + // CHECK: [[CMP:%.*]] = fcmp ult <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x 
i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ngt_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ngt_us + // CHECK: [[CMP:%.*]] = fcmp ule <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_false_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_false_oq + // CHECK: [[CMP:%.*]] = fcmp false <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_oq + // CHECK: [[CMP:%.*]] = fcmp one <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ge_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ge_os + // CHECK: [[CMP:%.*]] = fcmp oge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_gt_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_gt_os + // CHECK: [[CMP:%.*]] = fcmp ogt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_true_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_true_uq + // CHECK: [[CMP:%.*]] = fcmp true <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_eq_os + // CHECK: [[CMP:%.*]] = fcmp oeq <8 x half> 
%{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_lt_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_lt_oq + // CHECK: [[CMP:%.*]] = fcmp olt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_le_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_le_oq + // CHECK: [[CMP:%.*]] = fcmp ole <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_unord_s(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_unord_s + // CHECK: [[CMP:%.*]] = fcmp uno <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_us + // CHECK: [[CMP:%.*]] = fcmp une <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nlt_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nlt_uq + // CHECK: [[CMP:%.*]] = fcmp uge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nle_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nle_uq + // CHECK: [[CMP:%.*]] = fcmp ugt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ord_s(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ord_s + // CHECK: 
[[CMP:%.*]] = fcmp ord <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S); +} + +__mmask8 test_mm_mask_cmp_ph_mask_eq_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_eq_us + // CHECK: [[CMP:%.*]] = fcmp ueq <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US); +} + +__mmask8 test_mm_mask_cmp_ph_mask_nge_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_nge_uq + // CHECK: [[CMP:%.*]] = fcmp ult <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ngt_uq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ngt_uq + // CHECK: [[CMP:%.*]] = fcmp ule <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_false_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_false_os + // CHECK: [[CMP:%.*]] = fcmp false <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_neq_os(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_neq_os + // CHECK: [[CMP:%.*]] = fcmp one <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS); +} + +__mmask8 test_mm_mask_cmp_ph_mask_ge_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_ge_oq + // CHECK: [[CMP:%.*]] = fcmp oge <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_gt_oq(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: 
test_mm_mask_cmp_ph_mask_gt_oq + // CHECK: [[CMP:%.*]] = fcmp ogt <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ); +} + +__mmask8 test_mm_mask_cmp_ph_mask_true_us(__mmask8 m, __m128h a, __m128h b) { + // CHECK-LABEL: test_mm_mask_cmp_ph_mask_true_us + // CHECK: [[CMP:%.*]] = fcmp true <8 x half> %{{.*}}, %{{.*}} + // CHECK: and <8 x i1> [[CMP]], {{.*}} + return _mm_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US); +} + __m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { // CHECK-LABEL: @test_mm_mask_blend_ph // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> @@ -202,3 +1266,51 @@ __m256h test_mm256_permutexvar_ph(__m256i __A, __m256h __B) { // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x half> return _mm256_permutexvar_ph(__A, __B); } + +_Float16 test_mm256_reduce_add_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_add_ph + // CHECK: call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> %{{.*}}) + return _mm256_reduce_add_ph(__W); +} + +_Float16 test_mm256_reduce_mul_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_mul_ph + // CHECK: call reassoc half @llvm.vector.reduce.fmul.v16f16(half 0xH3C00, <16 x half> %{{.*}}) + return _mm256_reduce_mul_ph(__W); +} + +_Float16 test_mm256_reduce_max_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_max_ph + // CHECK: call nnan half @llvm.vector.reduce.fmax.v16f16(<16 x half> %{{.*}}) + return _mm256_reduce_max_ph(__W); +} + +_Float16 test_mm256_reduce_min_ph(__m256h __W) { + // CHECK-LABEL: @test_mm256_reduce_min_ph + // CHECK: call nnan half @llvm.vector.reduce.fmin.v16f16(<16 x half> %{{.*}}) + return _mm256_reduce_min_ph(__W); +} + +_Float16 test_mm_reduce_add_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_add_ph + // CHECK: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> %{{.*}}) + return _mm_reduce_add_ph(__W); +} + +_Float16 test_mm_reduce_mul_ph(__m128h __W) { 
+ // CHECK-LABEL: @test_mm_reduce_mul_ph + // CHECK: call reassoc half @llvm.vector.reduce.fmul.v8f16(half 0xH3C00, <8 x half> %{{.*}}) + return _mm_reduce_mul_ph(__W); +} + +_Float16 test_mm_reduce_min_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_min_ph + // CHECK: call nnan half @llvm.vector.reduce.fmin.v8f16(<8 x half> %{{.*}}) + return _mm_reduce_min_ph(__W); +} + +_Float16 test_mm_reduce_max_ph(__m128h __W) { + // CHECK-LABEL: @test_mm_reduce_max_ph + // CHECK: call nnan half @llvm.vector.reduce.fmax.v8f16(<8 x half> %{{.*}}) + return _mm_reduce_max_ph(__W); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 5848356b5b1a1..ae0a416175f9e 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5108,3 +5108,116 @@ let TargetPrefix = "x86" in { def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">, Intrinsic<[], [llvm_i64_ty], []>; } + +//===----------------------------------------------------------------------===// +// avx512_fp16: vaddph +let TargetPrefix = "x86" in { + def int_x86_avx512fp16_add_ph_512 + : GCCBuiltin<"__builtin_ia32_addph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_sub_ph_512 + : GCCBuiltin<"__builtin_ia32_subph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mul_ph_512 + : GCCBuiltin<"__builtin_ia32_mulph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_div_ph_512 + : GCCBuiltin<"__builtin_ia32_divph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_max_ph_128 + : GCCBuiltin<"__builtin_ia32_maxph128">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty ], [ 
IntrNoMem ]>; + def int_x86_avx512fp16_max_ph_256 + : GCCBuiltin<"__builtin_ia32_maxph256">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_max_ph_512 + : GCCBuiltin<"__builtin_ia32_maxph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_min_ph_128 + : GCCBuiltin<"__builtin_ia32_minph128">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_min_ph_256 + : GCCBuiltin<"__builtin_ia32_minph256">, + Intrinsic<[ llvm_v16f16_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty ], [ IntrNoMem ]>; + def int_x86_avx512fp16_min_ph_512 + : GCCBuiltin<"__builtin_ia32_minph512">, + Intrinsic<[ llvm_v32f16_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + + def int_x86_avx512fp16_mask_cmp_ph_512 + : Intrinsic<[ llvm_v32i1_ty ], + [ llvm_v32f16_ty, llvm_v32f16_ty, llvm_i32_ty, llvm_v32i1_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_mask_cmp_ph_256 + : Intrinsic<[ llvm_v16i1_ty ], + [ llvm_v16f16_ty, llvm_v16f16_ty, llvm_i32_ty, llvm_v16i1_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_cmp_ph_128 + : Intrinsic<[ llvm_v8i1_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_v8i1_ty ], + [ IntrNoMem, ImmArg> ]>; + + def int_x86_avx512fp16_mask_add_sh_round + : GCCBuiltin<"__builtin_ia32_addsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_sub_sh_round + : GCCBuiltin<"__builtin_ia32_subsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_mul_sh_round + : GCCBuiltin<"__builtin_ia32_mulsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ 
llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_div_sh_round + : GCCBuiltin<"__builtin_ia32_divsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_min_sh_round + : GCCBuiltin<"__builtin_ia32_minsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_max_sh_round + : GCCBuiltin<"__builtin_ia32_maxsh_round_mask">, + Intrinsic<[ llvm_v8f16_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_v8f16_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg> ]>; + def int_x86_avx512fp16_mask_cmp_sh + : GCCBuiltin<"__builtin_ia32_cmpsh_mask">, + Intrinsic<[ llvm_i8_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i8_ty, + llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; + def int_x86_avx512fp16_vcomi_sh + : GCCBuiltin<"__builtin_ia32_vcomish">, + Intrinsic<[ llvm_i32_ty ], + [ llvm_v8f16_ty, llvm_v8f16_ty, llvm_i32_ty, llvm_i32_ty ], + [ IntrNoMem, ImmArg>, ImmArg> ]>; +} diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b54b7adfff989..6cd64ad2592a8 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3127,9 +3127,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, unsigned ComparisonPredicate = ~0U; - // FIXME: Hack to recognize cmp{ss,sd,ps,pd}. + // FIXME: Hack to recognize cmp{sh,ss,sd,ph,ps,pd}. 
if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || + PatchedName.endswith("sh") || PatchedName.endswith("ph") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { bool IsVCMP = PatchedName[0] == 'v'; unsigned CCIdx = IsVCMP ? 4 : 3; @@ -3192,6 +3193,10 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, PatchedName = IsVCMP ? "vcmpps" : "cmpps"; else if (PatchedName.endswith("pd")) PatchedName = IsVCMP ? "vcmppd" : "cmppd"; + else if (PatchedName.endswith("sh")) + PatchedName = "vcmpsh"; + else if (PatchedName.endswith("ph")) + PatchedName = "vcmpph"; else llvm_unreachable("Unexpected suffix!"); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index 30f279cc94a51..ef0b5c1fe11c8 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -153,6 +153,20 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: + case X86::VCMPPHZrmi: case X86::VCMPPHZrri: + case X86::VCMPSHZrm: case X86::VCMPSHZrr: + case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int: + case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: + case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik: + case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: + case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk: + case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: + case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: + case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: + case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: + case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk: if (Imm >= 0 && Imm <= 31) 
{ OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); @@ -176,6 +190,8 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; else NumElts = (Desc.TSFlags & X86II::VEX_W) ? 2 : 4; + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) + NumElts *= 2; OS << "{1to" << NumElts << "}"; } else { if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index d8dbbbbf27790..167580ec1ed00 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -264,6 +264,24 @@ void X86InstPrinterCommon::printCMPMnemonic(const MCInst *MI, bool IsVCmp, case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: OS << "ss\t"; break; + case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: + case X86::VCMPPHZrmi: case X86::VCMPPHZrri: + case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: + case X86::VCMPPHZ256rmik: case X86::VCMPPHZ256rrik: + case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: + case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: + case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: + case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: + case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: + OS << "ph\t"; + break; + case X86::VCMPSHZrm: case X86::VCMPSHZrr: + case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int: + case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk: + case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk: + OS << "sh\t"; + break; } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6614978b25d1a..319c4eeb4ed9a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1900,6 +1900,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() 
&& Subtarget.hasFP16()) { auto setGroup = [&] (MVT VT) { + setOperationAction(ISD::FADD, VT, Legal); + setOperationAction(ISD::STRICT_FADD, VT, Legal); + setOperationAction(ISD::FSUB, VT, Legal); + setOperationAction(ISD::STRICT_FSUB, VT, Legal); + setOperationAction(ISD::FMUL, VT, Legal); + setOperationAction(ISD::STRICT_FMUL, VT, Legal); + setOperationAction(ISD::FDIV, VT, Legal); + setOperationAction(ISD::STRICT_FDIV, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); @@ -1917,6 +1926,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // AVX512_FP16 scalar operations setGroup(MVT::f16); addRegisterClass(MVT::f16, &X86::FR16XRegClass); + setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); + setOperationAction(ISD::BR_CC, MVT::f16, Expand); + setOperationAction(ISD::SETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + + setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); @@ -1930,6 +1947,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); + + setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom); + setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom); } if (Subtarget.hasVLX()) { @@ -47951,6 +47971,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64) || + (Subtarget.hasFP16() && VT == MVT::f16) || (VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); @@ -48512,6 +48533,9 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64) return 
SDValue(); + // We don't have CMPP Instruction for vxf16 + if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16) + return SDValue(); // We can only do this if the vector size in 256 bits or less. unsigned Size = VT.getSizeInBits(); if (Size > 256 && Subtarget.useAVX512Regs()) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 08f51c3b4b341..9672e3835f469 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2169,6 +2169,10 @@ let Predicates = [HasAVX512] in { X86cmpms_su, X86cmpmsSAE_su, SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } +let Predicates = [HasFP16], ExeDomain = SSEPackedSingle in + defm VCMPSHZ : avx512_cmp_scalar, AVX512XSIi8Base, TA; multiclass avx512_icmp_packed opc, string OpcodeStr, X86FoldableSchedWrite sched, @@ -2631,13 +2635,14 @@ multiclass avx512_vcmp_sae { EVEX_B, Sched<[sched]>; } -multiclass avx512_vcmp { - let Predicates = [HasAVX512] in { +multiclass avx512_vcmp { + let Predicates = [Pred] in { defm Z : avx512_vcmp_common, avx512_vcmp_sae, EVEX_V512; } - let Predicates = [HasAVX512,HasVLX] in { + let Predicates = [Pred,HasVLX] in { defm Z128 : avx512_vcmp_common, EVEX_V128; defm Z256 : avx512_vcmp_common, EVEX_V256; } @@ -2659,6 +2664,13 @@ let Predicates = [HasAVX512] in { (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } +defm VCMPPH : avx512_vcmp, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA; +let Predicates = [HasFP16] in { + def : Pat<(v1i1 (X86cmpms(loadf16 addr:$src2), FR16X:$src1, CommutableCMPCC:$cc)), + (VCMPSHZrm FR16X:$src1, addr:$src2, imm:$cc)>; +} + // ---------------------------------------------------------------- // FPClass @@ -4152,7 +4164,7 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, - [HasFP16, OptForSize]>, + [HasFP16]>, 
VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering opc, string OpcodeStr, SDPatternOperator avx512_fp_scalar_round, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; + let Predicates = [HasFP16] in + defm SHZ : avx512_fp_scalar, + avx512_fp_scalar_round, + T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>; } multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode, @@ -5647,6 +5665,13 @@ multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode, VecNode, SaeNode, sched.PD.Scl, IsCommutable, NAME#"SD">, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; + let Predicates = [HasFP16] in { + defm SHZ : avx512_fp_scalar_sae, + T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, + NotEVEX2VEXConvertible; + } } defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds, SchedWriteFAddSizes, 1>; @@ -5702,6 +5727,15 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; +defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, + SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS, + EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, + NotEVEX2VEXConvertible; +defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, + SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS, + EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, + NotEVEX2VEXConvertible; + multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, @@ -5789,9 +5823,33 @@ multiclass avx512_fp_binop_p opc, string OpcodeStr, SDPatternOperator Op } } +multiclass avx512_fp_binop_ph opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, + X86SchedWriteSizes sched, bit IsCommutable = 0, + bit IsPD128Commutable = IsCommutable> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_packed, EVEX_V512, T_MAP5PS, + EVEX_CD8<16, CD8VF>; + } 
+ let Predicates = [HasVLX, HasFP16] in { + defm PHZ128 : avx512_fp_packed, EVEX_V128, T_MAP5PS, + EVEX_CD8<16, CD8VF>; + defm PHZ256 : avx512_fp_packed, EVEX_V256, T_MAP5PS, + EVEX_CD8<16, CD8VF>; + } +} + let Uses = [MXCSR] in multiclass avx512_fp_binop_p_round opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_round_packed, + EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + } defm PSZ : avx512_fp_round_packed, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -5803,6 +5861,11 @@ multiclass avx512_fp_binop_p_round opc, string OpcodeStr, SDNode OpNodeR let Uses = [MXCSR] in multiclass avx512_fp_binop_p_sae opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { + let Predicates = [HasFP16] in { + defm PHZ : avx512_fp_sae_packed, + EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + } defm PSZ : avx512_fp_sae_packed, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; @@ -5813,26 +5876,36 @@ multiclass avx512_fp_binop_p_sae opc, string OpcodeStr, SDNode OpNodeRnd defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512, SchedWriteFAddSizes, 1>, + avx512_fp_binop_ph<0x58, "vadd", any_fadd, fadd, SchedWriteFAddSizes, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512, SchedWriteFMulSizes, 1>, + avx512_fp_binop_ph<0x59, "vmul", any_fmul, fmul, SchedWriteFMulSizes, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>; defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512, SchedWriteFAddSizes>, + avx512_fp_binop_ph<0x5C, "vsub", any_fsub, fsub, SchedWriteFAddSizes>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512, SchedWriteFDivSizes>, + avx512_fp_binop_ph<0x5E, "vdiv", any_fdiv, fdiv, SchedWriteFDivSizes>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>; defm VMIN 
: avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512, SchedWriteFCmpSizes, 0>, + avx512_fp_binop_ph<0x5D, "vmin", X86fmin, X86fmin, SchedWriteFCmpSizes, 0>, avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>; defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512, SchedWriteFCmpSizes, 0>, + avx512_fp_binop_ph<0x5F, "vmax", X86fmax, X86fmax, SchedWriteFCmpSizes, 0>, avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>; let isCodeGenOnly = 1 in { defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512, + SchedWriteFCmpSizes, 1>, + avx512_fp_binop_ph<0x5D, "vmin", X86fminc, X86fminc, SchedWriteFCmpSizes, 1>; defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512, + SchedWriteFCmpSizes, 1>, + avx512_fp_binop_ph<0x5F, "vmax", X86fmaxc, X86fmaxc, SchedWriteFCmpSizes, 1>; } let Uses = [], mayRaiseFPException = 0 in { @@ -8945,6 +9018,30 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { } } +let Defs = [EFLAGS], Predicates = [HasFP16] in { + defm VUCOMISHZ : avx512_ord_cmp_sae<0x2E, v8f16x_info, "vucomish", + SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS, + EVEX_CD8<16, CD8VT1>; + defm VCOMISHZ : avx512_ord_cmp_sae<0x2F, v8f16x_info, "vcomish", + SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS, + EVEX_CD8<16, CD8VT1>; + defm VUCOMISHZ : sse12_ord_cmp<0x2E, FR16X, X86any_fcmp, f16, f16mem, loadf16, + "ucomish", SSEPackedSingle>, T_MAP5PS, EVEX, + VEX_LIG, EVEX_CD8<16, CD8VT1>; + defm VCOMISHZ : sse12_ord_cmp<0x2F, FR16X, X86strict_fcmps, f16, f16mem, loadf16, + "comish", SSEPackedSingle>, T_MAP5PS, EVEX, + VEX_LIG, EVEX_CD8<16, CD8VT1>; + let isCodeGenOnly = 1 in { + defm VUCOMISHZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v8f16, shmem, + sse_load_f16, "ucomish", SSEPackedSingle>, + T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + + defm VCOMISHZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8f16, shmem, + sse_load_f16, "comish", SSEPackedSingle>, + T_MAP5PS, EVEX, VEX_LIG, 
EVEX_CD8<16, CD8VT1>; + } +} + /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -11868,6 +11965,11 @@ defm : AVX512_scalar_math_fp_patterns; defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; + multiclass AVX512_scalar_unary_math_patterns { let Predicates = [HasAVX512] in { diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 17fe7f0bd310d..251d66575080e 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -1550,6 +1550,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, + { X86::VADDPHZ128rr, X86::VADDPHZ128rm, 0 }, + { X86::VADDPHZ256rr, X86::VADDPHZ256rm, 0 }, + { X86::VADDPHZrr, X86::VADDPHZrm, 0 }, { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, @@ -1559,6 +1562,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, + { X86::VADDSHZrr, X86::VADDSHZrm, 0 }, + { X86::VADDSHZrr_Int, X86::VADDSHZrm_Int, TB_NO_REVERSE }, { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, @@ -1642,6 +1647,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, + { X86::VCMPPHZ128rri, X86::VCMPPHZ128rmi, 0 }, + { 
X86::VCMPPHZ256rri, X86::VCMPPHZ256rmi, 0 }, + { X86::VCMPPHZrri, X86::VCMPPHZrmi, 0 }, { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, @@ -1651,6 +1659,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE }, + { X86::VCMPSHZrr, X86::VCMPSHZrm, 0 }, + { X86::VCMPSHZrr_Int, X86::VCMPSHZrm_Int, TB_NO_REVERSE }, { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, @@ -1782,6 +1792,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, + { X86::VDIVPHZ128rr, X86::VDIVPHZ128rm, 0 }, + { X86::VDIVPHZ256rr, X86::VDIVPHZ256rm, 0 }, + { X86::VDIVPHZrr, X86::VDIVPHZrm, 0 }, { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, @@ -1791,6 +1804,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, + { X86::VDIVSHZrr, X86::VDIVSHZrm, 0 }, + { X86::VDIVSHZrr_Int, X86::VDIVSHZrm_Int, TB_NO_REVERSE }, { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, @@ -1912,6 +1927,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, + { X86::VMAXCPHZ128rr, X86::VMAXCPHZ128rm, 0 }, + { X86::VMAXCPHZ256rr, X86::VMAXCPHZ256rm, 0 }, + { X86::VMAXCPHZrr, X86::VMAXCPHZrm, 0 }, { 
X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 }, { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, @@ -1919,6 +1937,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, + { X86::VMAXCSHZrr, X86::VMAXCSHZrm, 0 }, { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, @@ -1926,6 +1945,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, + { X86::VMAXPHZ128rr, X86::VMAXPHZ128rm, 0 }, + { X86::VMAXPHZ256rr, X86::VMAXPHZ256rm, 0 }, + { X86::VMAXPHZrr, X86::VMAXPHZrm, 0 }, { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, @@ -1935,6 +1957,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, + { X86::VMAXSHZrr, X86::VMAXSHZrm, 0 }, + { X86::VMAXSHZrr_Int, X86::VMAXSHZrm_Int, TB_NO_REVERSE }, { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, @@ -1944,6 +1968,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, + { X86::VMINCPHZ128rr, X86::VMINCPHZ128rm, 0 }, + { X86::VMINCPHZ256rr, X86::VMINCPHZ256rm, 0 }, + { X86::VMINCPHZrr, X86::VMINCPHZrm, 0 }, { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, @@ -1951,6 +1978,7 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = 
{ { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, + { X86::VMINCSHZrr, X86::VMINCSHZrm, 0 }, { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, @@ -1958,6 +1986,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, + { X86::VMINPHZ128rr, X86::VMINPHZ128rm, 0 }, + { X86::VMINPHZ256rr, X86::VMINPHZ256rm, 0 }, + { X86::VMINPHZrr, X86::VMINPHZrm, 0 }, { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, @@ -1967,6 +1998,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, + { X86::VMINSHZrr, X86::VMINSHZrm, 0 }, + { X86::VMINSHZrr_Int, X86::VMINSHZrm_Int, TB_NO_REVERSE }, { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, @@ -2021,6 +2054,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, + { X86::VMULPHZ128rr, X86::VMULPHZ128rm, 0 }, + { X86::VMULPHZ256rr, X86::VMULPHZ256rm, 0 }, + { X86::VMULPHZrr, X86::VMULPHZrm, 0 }, { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, @@ -2030,6 +2066,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, + { X86::VMULSHZrr, X86::VMULSHZrm, 0 }, + { X86::VMULSHZrr_Int, 
X86::VMULSHZrm_Int, TB_NO_REVERSE }, { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, @@ -2944,6 +2982,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, + { X86::VSUBPHZ128rr, X86::VSUBPHZ128rm, 0 }, + { X86::VSUBPHZ256rr, X86::VSUBPHZ256rm, 0 }, + { X86::VSUBPHZrr, X86::VSUBPHZrm, 0 }, { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, @@ -2953,6 +2994,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, + { X86::VSUBSHZrr, X86::VSUBSHZrm, 0 }, + { X86::VSUBSHZrr_Int, X86::VSUBSHZrm_Int, TB_NO_REVERSE }, { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, @@ -2999,10 +3042,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, + { X86::VADDPHZ128rrkz, X86::VADDPHZ128rmkz, 0 }, + { X86::VADDPHZ256rrkz, X86::VADDPHZ256rmkz, 0 }, + { X86::VADDPHZrrkz, X86::VADDPHZrmkz, 0 }, { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VADDSHZrr_Intkz, X86::VADDSHZrm_Intkz, TB_NO_REVERSE }, { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE }, { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, @@ -3041,10 +3088,14 @@ static const X86MemoryFoldTableEntry 
MemoryFoldTable3[] = { { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 }, { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 }, { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 }, + { X86::VCMPPHZ128rrik, X86::VCMPPHZ128rmik, 0 }, + { X86::VCMPPHZ256rrik, X86::VCMPPHZ256rmik, 0 }, + { X86::VCMPPHZrrik, X86::VCMPPHZrmik, 0 }, { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 }, { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 }, { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 }, { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE }, + { X86::VCMPSHZrr_Intk, X86::VCMPSHZrm_Intk, TB_NO_REVERSE }, { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE }, { X86::VCVTDQ2PDZ128rrk, X86::VCVTDQ2PDZ128rmk, TB_NO_REVERSE }, { X86::VCVTDQ2PDZ256rrk, X86::VCVTDQ2PDZ256rmk, 0 }, @@ -3141,10 +3192,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, + { X86::VDIVPHZ128rrkz, X86::VDIVPHZ128rmkz, 0 }, + { X86::VDIVPHZ256rrkz, X86::VDIVPHZ256rmkz, 0 }, + { X86::VDIVPHZrrkz, X86::VDIVPHZrmkz, 0 }, { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VDIVSHZrr_Intkz, X86::VDIVSHZrm_Intkz, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, { X86::VDPBF16PSZ128r, X86::VDPBF16PSZ128m, 0 }, { X86::VDPBF16PSZ256r, X86::VDPBF16PSZ256m, 0 }, @@ -3521,30 +3576,44 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, + { X86::VMAXCPHZ128rrkz, X86::VMAXCPHZ128rmkz, 0 }, + { X86::VMAXCPHZ256rrkz, X86::VMAXCPHZ256rmkz, 0 }, + { X86::VMAXCPHZrrkz, X86::VMAXCPHZrmkz, 0 }, { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, { 
X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, + { X86::VMAXPHZ128rrkz, X86::VMAXPHZ128rmkz, 0 }, + { X86::VMAXPHZ256rrkz, X86::VMAXPHZ256rmkz, 0 }, + { X86::VMAXPHZrrkz, X86::VMAXPHZrmkz, 0 }, { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMAXSHZrr_Intkz, X86::VMAXSHZrm_Intkz, TB_NO_REVERSE }, { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, TB_NO_REVERSE }, { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, + { X86::VMINCPHZ128rrkz, X86::VMINCPHZ128rmkz, 0 }, + { X86::VMINCPHZ256rrkz, X86::VMINCPHZ256rmkz, 0 }, + { X86::VMINCPHZrrkz, X86::VMINCPHZrmkz, 0 }, { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMINPHZ128rrkz, X86::VMINPHZ128rmkz, 0 }, + { X86::VMINPHZ256rrkz, X86::VMINPHZ256rmkz, 0 }, + { X86::VMINPHZrrkz, X86::VMINPHZrmkz, 0 }, { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMINSHZrr_Intkz, X86::VMINSHZrm_Intkz, TB_NO_REVERSE }, { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, TB_NO_REVERSE }, { X86::VMOVAPDZ128rrk, X86::VMOVAPDZ128rmk, TB_NO_REVERSE | TB_ALIGN_16 }, { X86::VMOVAPDZ256rrk, X86::VMOVAPDZ256rmk, TB_NO_REVERSE | TB_ALIGN_32 }, @@ -3588,10 +3657,14 @@ static const X86MemoryFoldTableEntry 
MemoryFoldTable3[] = { { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VMULPHZ128rrkz, X86::VMULPHZ128rmkz, 0 }, + { X86::VMULPHZ256rrkz, X86::VMULPHZ256rmkz, 0 }, + { X86::VMULPHZrrkz, X86::VMULPHZrmkz, 0 }, { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMULSHZrr_Intkz, X86::VMULSHZrm_Intkz, TB_NO_REVERSE }, { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE }, { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, @@ -4319,10 +4392,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VSUBPHZ128rrkz, X86::VSUBPHZ128rmkz, 0 }, + { X86::VSUBPHZ256rrkz, X86::VSUBPHZ256rmkz, 0 }, + { X86::VSUBPHZrrkz, X86::VSUBPHZrmkz, 0 }, { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VSUBSHZrr_Intkz, X86::VSUBSHZrm_Intkz, TB_NO_REVERSE }, { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE }, { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, @@ -4348,10 +4425,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, + { X86::VADDPHZ128rrk, X86::VADDPHZ128rmk, 0 }, + { X86::VADDPHZ256rrk, X86::VADDPHZ256rmk, 0 }, + { X86::VADDPHZrrk, X86::VADDPHZrmk, 0 }, { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 
0 }, { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE }, + { X86::VADDSHZrr_Intk, X86::VADDSHZrm_Intk, TB_NO_REVERSE }, { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE }, { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, @@ -4382,10 +4463,14 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, + { X86::VDIVPHZ128rrk, X86::VDIVPHZ128rmk, 0 }, + { X86::VDIVPHZ256rrk, X86::VDIVPHZ256rmk, 0 }, + { X86::VDIVPHZrrk, X86::VDIVPHZrmk, 0 }, { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, + { X86::VDIVSHZrr_Intk, X86::VDIVSHZrm_Intk, TB_NO_REVERSE }, { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, { X86::VDPBF16PSZ128rk, X86::VDPBF16PSZ128mk, 0 }, { X86::VDPBF16PSZ128rkz, X86::VDPBF16PSZ128mkz, 0 }, @@ -4701,38 +4786,56 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, + { X86::VMAXCPHZ128rrk, X86::VMAXCPHZ128rmk, 0 }, + { X86::VMAXCPHZ256rrk, X86::VMAXCPHZ256rmk, 0 }, + { X86::VMAXCPHZrrk, X86::VMAXCPHZrmk, 0 }, { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, + { X86::VMAXPHZ128rrk, X86::VMAXPHZ128rmk, 0 }, + { X86::VMAXPHZ256rrk, X86::VMAXPHZ256rmk, 0 }, + { X86::VMAXPHZrrk, X86::VMAXPHZrmk, 0 }, { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, 
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMAXSHZrr_Intk, X86::VMAXSHZrm_Intk, TB_NO_REVERSE }, { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, TB_NO_REVERSE }, { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, + { X86::VMINCPHZ128rrk, X86::VMINCPHZ128rmk, 0 }, + { X86::VMINCPHZ256rrk, X86::VMINCPHZ256rmk, 0 }, + { X86::VMINCPHZrrk, X86::VMINCPHZrmk, 0 }, { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMINPHZ128rrk, X86::VMINPHZ128rmk, 0 }, + { X86::VMINPHZ256rrk, X86::VMINPHZ256rmk, 0 }, + { X86::VMINPHZrrk, X86::VMINPHZrmk, 0 }, { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMINSHZrr_Intk, X86::VMINSHZrm_Intk, TB_NO_REVERSE }, { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, TB_NO_REVERSE }, { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VMULPHZ128rrk, X86::VMULPHZ128rmk, 0 }, + { X86::VMULPHZ256rrk, X86::VMULPHZ256rmk, 0 }, + { X86::VMULPHZrrk, X86::VMULPHZrmk, 0 }, { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMULSHZrr_Intk, X86::VMULSHZrm_Intk, TB_NO_REVERSE }, { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE }, { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, @@ -5248,10 +5351,14 @@ static const X86MemoryFoldTableEntry 
MemoryFoldTable4[] = { { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VSUBPHZ128rrk, X86::VSUBPHZ128rmk, 0 }, + { X86::VSUBPHZ256rrk, X86::VSUBPHZ256rmk, 0 }, + { X86::VSUBPHZrrk, X86::VSUBPHZrmk, 0 }, { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE }, + { X86::VSUBSHZrr_Intk, X86::VSUBSHZrm_Intk, TB_NO_REVERSE }, { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE }, { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 4709021822ac3..38a18fddac0ba 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2487,6 +2487,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, case X86::VCMPSSZrr: case X86::VCMPPDZrri: case X86::VCMPPSZrri: + case X86::VCMPSHZrr: + case X86::VCMPPHZrri: + case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rri: case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: @@ -6047,6 +6051,31 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } } + if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) { + // These instructions only load 16 bits, we can't fold them if the + // destination register is wider than 16 bits (2 bytes), and its user + // instruction isn't scalar (SH). 
+ switch (UserOpc) { + case X86::VADDSHZrr_Int: + case X86::VCMPSHZrr_Int: + case X86::VDIVSHZrr_Int: + case X86::VMAXSHZrr_Int: + case X86::VMINSHZrr_Int: + case X86::VMULSHZrr_Int: + case X86::VSUBSHZrr_Int: + case X86::VADDSHZrr_Intk: case X86::VADDSHZrr_Intkz: + case X86::VCMPSHZrr_Intk: + case X86::VDIVSHZrr_Intk: case X86::VDIVSHZrr_Intkz: + case X86::VMAXSHZrr_Intk: case X86::VMAXSHZrr_Intkz: + case X86::VMINSHZrr_Intk: case X86::VMINSHZrr_Intkz: + case X86::VMULSHZrr_Intk: case X86::VMULSHZrr_Intkz: + case X86::VSUBSHZrr_Intk: case X86::VSUBSHZrr_Intkz: + return false; + default: + return true; + } + } + return false; } @@ -8401,6 +8430,14 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VMINCSSrr: case X86::VMINCSDZrr: case X86::VMINCSSZrr: + case X86::VMAXCPHZ128rr: + case X86::VMAXCPHZ256rr: + case X86::VMAXCPHZrr: + case X86::VMAXCSHZrr: + case X86::VMINCPHZ128rr: + case X86::VMINCPHZ256rr: + case X86::VMINCPHZrr: + case X86::VMINCSHZrr: return true; case X86::ADDPDrr: case X86::ADDPSrr: @@ -8438,6 +8475,14 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VMULSSrr: case X86::VMULSDZrr: case X86::VMULSSZrr: + case X86::VADDPHZ128rr: + case X86::VADDPHZ256rr: + case X86::VADDPHZrr: + case X86::VADDSHZrr: + case X86::VMULPHZ128rr: + case X86::VMULPHZ256rr: + case X86::VMULPHZrr: + case X86::VMULSHZrr: return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); default: diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index de2500b8e1bd7..8c33624e28f0d 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -987,6 +987,34 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0), 
X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16), + X86_INTRINSIC_DATA(avx512fp16_add_ph_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), + X86_INTRINSIC_DATA(avx512fp16_div_ph_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_add_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FADDS, X86ISD::FADDS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_128, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_256, CMP_MASK_CC, X86ISD::CMPMM, 0), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_ph_512, CMP_MASK_CC, X86ISD::CMPMM, X86ISD::CMPMM_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_cmp_sh, CMP_MASK_SCALAR_CC, + X86ISD::FSETCCM, X86ISD::FSETCCM_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_div_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FDIVS, X86ISD::FDIVS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_max_sh_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMAXS, X86ISD::FMAXS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_min_sh_round, INTR_TYPE_SCALAR_MASK_SAE, + X86ISD::FMINS, X86ISD::FMINS_SAE), + X86_INTRINSIC_DATA(avx512fp16_mask_mul_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMULS, X86ISD::FMULS_RND), + X86_INTRINSIC_DATA(avx512fp16_mask_sub_sh_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FSUBS, X86ISD::FSUBS_RND), + X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), + X86_INTRINSIC_DATA(avx512fp16_min_ph_128, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512fp16_min_ph_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512fp16_min_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMIN, X86ISD::FMIN_SAE), + X86_INTRINSIC_DATA(avx512fp16_mul_ph_512, INTR_TYPE_2OP, ISD::FMUL, X86ISD::FMUL_RND), + X86_INTRINSIC_DATA(avx512fp16_sub_ph_512, INTR_TYPE_2OP, ISD::FSUB, 
X86ISD::FSUB_RND), + X86_INTRINSIC_DATA(avx512fp16_vcomi_sh, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll new file mode 100644 index 0000000000000..2a0433cd23071 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll @@ -0,0 +1,284 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unkown-unkown -mattr=+avx512bw -mattr=+avx512fp16 -mattr=+avx512vl | FileCheck %s + +declare <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_add_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_add_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_add_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_add_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd 
%edi, %k1 +; CHECK-NEXT: vaddph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vaddph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fadd <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_add_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_sub_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_sub_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: 
vsubph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_sub_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_sub_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vsubph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fsub <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_sub_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_mul_ph_512(<32 x half> %x1, <32 x half> %x2) 
{ +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_mul_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_mul_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_mul_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vmulph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fmul <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_mul_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, 
%k1 +; CHECK-NEXT: vmulph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_int_x86_avx512fp16_div_ph_512(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_mask_div_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res = select <32 x i1> %mask, <32 x half> %res0, <32 x half> %src + ret <32 x half> %res +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_div_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_div_ph_512: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vdivph (%rsi), %zmm1, %zmm1 {%k1} {z} +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %val = load <32 x half>, <32 x half>* %ptr + %res0 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 4) + %res1 = select <32 x i1> %mask, <32 x half> 
%res0, <32 x half> zeroinitializer + %t2 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %val, i32 4) + %res2 = select <32 x i1> %mask, <32 x half> %t2, <32 x half> zeroinitializer + %res3 = fdiv <32 x half> %res1, %res2 + ret <32 x half> %res3 +} + +define <32 x half> @test_int_x86_avx512fp16_div_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, <32 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_512_round: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %t1 = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> %x1, <32 x half> %x2, i32 10) + %res = select <32 x i1> %mask, <32 x half> %t1, <32 x half> %src + ret <32 x half> %res +} + +declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_min_ph(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_min_ph: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = fcmp olt <32 x half> %x1, %x2 + %res1 = select <32 x i1> %res0, <32 x half> %x1, <32 x half> %x2 + ret <32 x half> %res1 +} + +define <32 x half> @test_int_x86_avx512fp16_min_ph_512_sae(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_min_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + ret <32 x half> %res0 +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_min_ph_512_sae(<32 x half> %x1, <32 x half> %x2, i32 %msk) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_min_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminph {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + 
%res0 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res1 +} + +declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) + +define <32 x half> @test_max_ph(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_max_ph: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = fcmp ogt <32 x half> %x1, %x2 + %res1 = select <32 x i1> %res0, <32 x half> %x1, <32 x half> %x2 + ret <32 x half> %res1 +} + +define <32 x half> @test_int_x86_avx512fp16_max_ph_512_sae(<32 x half> %x1, <32 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_max_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph {sae}, %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %res0 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + ret <32 x half> %res0 +} + +define <32 x half> @test_int_x86_avx512fp16_maskz_max_ph_512_sae(<32 x half> %x1, <32 x half> %x2, i32 %msk) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_max_ph_512_sae: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxph {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = bitcast i32 %msk to <32 x i1> + %res0 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %x1, <32 x half> %x2, i32 8) + %res1 = select <32 x i1> %mask, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res1 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll new file mode 100644 index 0000000000000..da79411006d18 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-arith-vl-intrinsics.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unkown-unkown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s 
+ +define <16 x half> @test_int_x86_avx512fp16_add_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fadd <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_add_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vaddph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vaddph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fadd <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fadd <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fadd <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_add_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_add_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fadd <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_add_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_add_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fadd <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_add_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x 
half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vaddph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vaddph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fadd <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fadd <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fadd <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_add_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_add_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fadd <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_int_x86_avx512fp16_sub_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fsub <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_sub_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vsubph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vsubph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fsub <16 x half> %x1, %x2 
+ %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fsub <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fsub <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_sub_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_sub_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fsub <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_sub_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_sub_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fsub <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_sub_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsubph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vsubph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fsub <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fsub <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fsub <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_sub_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_sub_ph_128: +; CHECK: # 
%bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fsub <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_int_x86_avx512fp16_mul_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_mul_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fmul <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_mul_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vmulph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmulph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fmul <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fmul <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fmul <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_mul_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_mul_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fmul <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_mul_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: 
test_int_x86_avx512fp16_mul_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fmul <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_mul_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vmulph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmulph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fmul <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fmul <8 x half> %x1, %val + %res2 = select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fmul <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_mul_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_mul_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fmul <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_int_x86_avx512fp16_div_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = fdiv <16 x half> %x1, %x2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_mask_div_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; 
CHECK-NEXT: vmovaps %ymm2, %ymm3 +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm3 {%k1} +; CHECK-NEXT: vdivph (%rsi), %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vdivph %ymm2, %ymm3, %ymm0 +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %val = load <16 x half>, <16 x half>* %ptr + %res0 = fdiv <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %src + %t3 = fdiv <16 x half> %x1, %val + %res2 = select <16 x i1> %msk, <16 x half> %t3, <16 x half> %src + %res = fdiv <16 x half> %res1 , %res2 + ret <16 x half> %res +} + +define <16 x half> @test_int_x86_avx512fp16_maskz_div_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, <16 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_div_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i16 %mask to <16 x i1> + %res0 = fdiv <16 x half> %x1, %x2 + %res1 = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res1 +} + +define <8 x half> @test_int_x86_avx512fp16_div_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_int_x86_avx512fp16_div_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = fdiv <8 x half> %x1, %x2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_mask_div_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vdivph (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vdivph %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %val = load <8 x half>, <8 x half>* %ptr + %res0 = fdiv <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %src + %t3 = fdiv <8 x half> %x1, %val + %res2 = 
select <8 x i1> %msk, <8 x half> %t3, <8 x half> %src + %res = fdiv <8 x half> %res1 , %res2 + ret <8 x half> %res +} + +define <8 x half> @test_int_x86_avx512fp16_maskz_div_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, <8 x half>* %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_maskz_div_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq + %msk = bitcast i8 %mask to <8 x i1> + %res0 = fdiv <8 x half> %x1, %x2 + %res1 = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res1 +} + +define <16 x half> @test_min_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_min_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = fcmp olt <16 x half> %x1, %x2 + %res1 = select <16 x i1> %res0, <16 x half> %x1, <16 x half> %x2 + ret <16 x half> %res1 +} + +define <16 x half> @test_max_ph_256(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_max_ph_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = fcmp ogt <16 x half> %x1, %x2 + %res1 = select <16 x i1> %res0, <16 x half> %x1, <16 x half> %x2 + ret <16 x half> %res1 +} + +define <8 x half> @test_min_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_min_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = fcmp olt <8 x half> %x1, %x2 + %res1 = select <8 x i1> %res0, <8 x half> %x1, <8 x half> %x2 + ret <8 x half> %res1 +} + +define <8 x half> @test_max_ph_128(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_max_ph_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = fcmp ogt <8 x half> %x1, %x2 + %res1 = select <8 x i1> %res0, <8 x half> %x1, <8 x half> %x2 + ret <8 x half> %res1 +} + +declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) +declare <16 x half> 
@llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) + +define <8 x half> @test_max_ph_128_2(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_max_ph_128_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %x1, <8 x half> %x2) + ret <8 x half> %res0 +} + +define <16 x half> @test_max_ph_256_2(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_max_ph_256_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %x1, <16 x half> %x2) + ret <16 x half> %res0 +} + +declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) +declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) + +define <8 x half> @test_min_ph_128_2(<8 x half> %x1, <8 x half> %x2) { +; CHECK-LABEL: test_min_ph_128_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res0 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %x1, <8 x half> %x2) + ret <8 x half> %res0 +} + +define <16 x half> @test_min_ph_256_2(<16 x half> %x1, <16 x half> %x2) { +; CHECK-LABEL: test_min_ph_256_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res0 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %x1, <16 x half> %x2) + ret <16 x half> %res0 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll new file mode 100644 index 0000000000000..1b234387c07c2 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512fp16 | FileCheck %s + +define <32 x half> @vaddph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vaddph_512_test: +; CHECK: ## 
%bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fadd <32 x half> %i, %j + ret <32 x half> %x +} + +define <32 x half> @vaddph_512_fold_test(<32 x half> %i, <32 x half>* %j) nounwind { +; CHECK-LABEL: vaddph_512_fold_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq + %tmp = load <32 x half>, <32 x half>* %j, align 4 + %x = fadd <32 x half> %i, %tmp + ret <32 x half> %x +} + +define <32 x half> @vaddph_512_broadc_test(<32 x half> %a) nounwind { +; CHECK-LABEL: vaddph_512_broadc_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm0, %zmm0 +; CHECK-NEXT: retq + %b = fadd <32 x half> %a, + ret <32 x half> %b +} + +define <16 x half> @vaddph_256_broadc_test(<16 x half> %a) nounwind { +; CHECK-LABEL: vaddph_256_broadc_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %ymm0, %ymm0 +; CHECK-NEXT: retq + %b = fadd <16 x half> %a, + ret <16 x half> %b +} + +define <8 x half> @vaddph_128_broadc_test(<8 x half> %a) nounwind { +; CHECK-LABEL: vaddph_128_broadc_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %xmm0 +; CHECK-NEXT: retq + %b = fadd <8 x half> %a, + ret <8 x half> %b +} + +define <32 x half> @vaddph_512_mask_test1(<32 x half> %i, <32 x half> %j, <32 x i1> %mask) nounwind readnone { +; CHECK-LABEL: vaddph_512_mask_test1: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsllw $7, %ymm2, %ymm2 +; CHECK-NEXT: vpmovb2m %ymm2, %k1 +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> %i + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_mask_test(<32 x half> %i, <32 x half> %j, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_mask_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpneq_oqph %zmm3, %zmm2, %k1 +; CHECK-NEXT: vaddph %zmm1, 
%zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> %i + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_maskz_test(<32 x half> %i, <32 x half> %j, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_maskz_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpneq_oqph %zmm3, %zmm2, %k1 +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> zeroinitializer + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_mask_fold_test(<32 x half> %i, <32 x half>* %j.ptr, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_mask_fold_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqph %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %j = load <32 x half>, <32 x half>* %j.ptr + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> %i + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_maskz_fold_test(<32 x half> %i, <32 x half>* %j.ptr, <32 x half> %mask1) nounwind readnone { +; CHECK-LABEL: vaddph_512_maskz_fold_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqph %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %j = load <32 x half>, <32 x half>* %j.ptr + %x = fadd <32 x half> %i, %j + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> zeroinitializer + ret <32 x half> %r +} + +define <32 x half> @vaddph_512_maskz_fold_test_2(<32 x half> %i, <32 x half>* %j.ptr, <32 x half> %mask1) nounwind 
readnone { +; CHECK-LABEL: vaddph_512_maskz_fold_test_2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpneq_oqph %zmm2, %zmm1, %k1 +; CHECK-NEXT: vaddph (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %mask = fcmp one <32 x half> %mask1, zeroinitializer + %j = load <32 x half>, <32 x half>* %j.ptr + %x = fadd <32 x half> %j, %i + %r = select <32 x i1> %mask, <32 x half> %x, <32 x half> zeroinitializer + ret <32 x half> %r +} + +define <32 x half> @vsubph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vsubph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fsub <32 x half> %i, %j + ret <32 x half> %x +} + +define <32 x half> @vmulph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vmulph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fmul <32 x half> %i, %j + ret <32 x half> %x +} + +define <32 x half> @vdivph_512_test(<32 x half> %i, <32 x half> %j) nounwind readnone { +; CHECK-LABEL: vdivph_512_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %x = fdiv <32 x half> %i, %j + ret <32 x half> %x +} + +define half @add_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: add_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fadd half %i, %j + %r = fadd half %x, %y + ret half %r +} + +define half @sub_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: sub_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovsh (%rdi), %xmm2 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vsubsh %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fsub half %i, %j + %r = fsub half %x, %y + ret half %r +} + +define half @sub_sh_2(half %i, half %j, half* %x.ptr) nounwind 
readnone { +; CHECK-LABEL: sub_sh_2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vsubsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fsub half %i, %j + %r = fsub half %y, %x + ret half %r +} + +define half @mul_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: mul_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fmul half %i, %j + %r = fmul half %x, %y + ret half %r +} + +define half @div_sh(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: div_sh: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovsh (%rdi), %xmm2 +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vdivsh %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fdiv half %i, %j + %r = fdiv half %x, %y + ret half %r +} + +define half @div_sh_2(half %i, half %j, half* %x.ptr) nounwind readnone { +; CHECK-LABEL: div_sh_2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vdivsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %x = load half, half* %x.ptr + %y = fdiv half %i, %j + %r = fdiv half %y, %x + ret half %r +} + +define i1 @cmp_une_sh(half %x, half %y) { +; CHECK-LABEL: cmp_une_sh: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpneqsh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %0 = fcmp une half %x, %y + ret i1 %0 +} + +define i1 @cmp_oeq_sh(half %x, half %y) { +; CHECK-LABEL: cmp_oeq_sh: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqsh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: ## kill: def $al killed $al killed $eax +; CHECK-NEXT: retq +entry: + %0 = fcmp oeq half %x, %y + ret i1 %0 +} + +define i1 @cmp_olt_sh(half %x, half %y) { +; CHECK-LABEL: cmp_olt_sh: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vucomish %xmm0, 
%xmm1 +; CHECK-NEXT: seta %al +; CHECK-NEXT: retq + entry: + %0 = fcmp olt half %x, %y + ret i1 %0 +} + +define <32 x i1> @cmp_ph(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: cmp_ph: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpneqph %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = fcmp une <32 x half> %x, %y + ret <32 x i1> %0 +} + +define <8 x i1> @fcmp_v8f16(<8 x half> %a, <8 x half> %b) +; CHECK-LABEL: fcmp_v8f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <8 x half> %a, %b + ret <8 x i1> %0 +} + +define <16 x i1> @fcmp_v16f16(<16 x half> %a, <16 x half> %b) +; CHECK-LABEL: fcmp_v16f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <16 x half> %a, %b + ret <16 x i1> %0 +} + +define <32 x i1> @fcmp_v32f16(<32 x half> %a, <32 x half> %b) +; CHECK-LABEL: fcmp_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpmovm2b %k0, %ymm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <32 x half> %a, %b + ret <32 x i1> %0 +} + +define <8 x i16> @zext_fcmp_v8f16(<8 x half> %a, <8 x half> %b) +; CHECK-LABEL: zext_fcmp_v8f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %xmm0 +; CHECK-NEXT: vpsrlw $15, %xmm0, %xmm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <8 x half> %a, %b + %1 = zext <8 x i1> %0 to <8 x i16> + ret <8 x i16> %1 +} + +define <16 x i16> @zext_fcmp_v16f16(<16 x half> %a, <16 x half> %b) +; CHECK-LABEL: zext_fcmp_v16f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %ymm0 +; CHECK-NEXT: vpsrlw $15, %ymm0, %ymm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <16 x half> %a, %b + %1 = zext <16 x i1> %0 to <16 x i16> + 
ret <16 x i16> %1 +} + +define <32 x i16> @zext_fcmp_v32f16(<32 x half> %a, <32 x half> %b) +; CHECK-LABEL: zext_fcmp_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vcmpeqph %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpmovm2w %k0, %zmm0 +; CHECK-NEXT: vpsrlw $15, %zmm0, %zmm0 +; CHECK-NEXT: retq +{ +entry: + %0 = fcmp oeq <32 x half> %a, %b + %1 = zext <32 x i1> %0 to <32 x i16> + ret <32 x i16> %1 +} + diff --git a/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll new file mode 100644 index 0000000000000..424d6ad759065 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fmaxnum.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK + +declare half @llvm.maxnum.f16(half, half) +declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) +declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) +declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>) + +define half @test_intrinsic_fmaxh(half %x, half %y) { +; CHECK-LABEL: test_intrinsic_fmaxh: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x76,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call half @llvm.maxnum.f16(half %x, half %y) readnone + ret half %z +} + +define <2 x half> @test_intrinsic_fmax_v2f16(<2 x half> %x, <2 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph 
%xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %x, <2 x half> %y) readnone + ret <2 x half> %z +} + +define <4 x half> @test_intrinsic_fmax_v4f16(<4 x half> %x, <4 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %x, <4 x half> %y) readnone + ret <4 x half> %z +} + +define <8 x half> @test_intrinsic_fmax_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %x, <8 x half> %y) readnone + ret <8 x half> %z +} + +define <16 x half> @test_intrinsic_fmax_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %ymm0, %ymm1, %ymm2 # encoding: 
[0x62,0xf5,0x74,0x28,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %x, <16 x half> %y) readnone + ret <16 x half> %z +} + +define <32 x half> @test_intrinsic_fmax_v32f16(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmax_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5f,0xd0] +; CHECK-NEXT: vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <32 x half> @llvm.maxnum.v32f16(<32 x half> %x, <32 x half> %y) readnone + ret <32 x half> %z +} + +define <4 x half> @maxnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: maxnum_intrinsic_nnan_fmf_f432: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5f,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call nnan <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %r +} + +define half @maxnum_intrinsic_nnan_attr_f16(half %a, half %b) #0 { +; CHECK-LABEL: maxnum_intrinsic_nnan_attr_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call half @llvm.maxnum.f16(half %a, half %b) + ret half %r +} + +define half @test_maxnum_const_op1(half %x) { +; CHECK-LABEL: test_maxnum_const_op1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # 
encoding: [0x62,0xf5,0x7e,0x08,0x5f,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.maxnum.f16(half 1.0, half %x) + ret half %r +} + +define half @test_maxnum_const_op2(half %x) { +; CHECK-LABEL: test_maxnum_const_op2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5f,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.maxnum.f16(half %x, half 1.0) + ret half %r +} + +define half @test_maxnum_const_nan(half %x) { +; CHECK-LABEL: test_maxnum_const_nan: +; CHECK: # %bb.0: +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.maxnum.f16(half %x, half 0x7fff000000000000) + ret half %r +} + +attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true"} diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll new file mode 100644 index 0000000000000..4ff9056fd791a --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fminnum.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs --show-mc-encoding -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK + +declare half @llvm.minnum.f16(half, half) +declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) +declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) +declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>) + +define half @test_intrinsic_fminh(half %x, half %y) { +; CHECK-LABEL: test_intrinsic_fminh: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm2 # 
encoding: [0x62,0xf5,0x76,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7e,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call half @llvm.minnum.f16(half %x, half %y) readnone + ret half %z +} + +define <2 x half> @test_intrinsic_fmin_v2f16(<2 x half> %x, <2 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <2 x half> @llvm.minnum.v2f16(<2 x half> %x, <2 x half> %y) readnone + ret <2 x half> %z +} + +define <4 x half> @test_intrinsic_fmin_v4f16(<4 x half> %x, <4 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, %xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <4 x half> @llvm.minnum.v4f16(<4 x half> %x, <4 x half> %y) readnone + ret <4 x half> %z +} + +define <8 x half> @test_intrinsic_fmin_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm2 # encoding: [0x62,0xf5,0x74,0x08,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %xmm0, 
%xmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x08,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <8 x half> @llvm.minnum.v8f16(<8 x half> %x, <8 x half> %y) readnone + ret <8 x half> %z +} + +define <16 x half> @test_intrinsic_fmin_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %ymm0, %ymm1, %ymm2 # encoding: [0x62,0xf5,0x74,0x28,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %ymm0, %ymm0, %k1 # encoding: [0x62,0xf3,0x7c,0x28,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <16 x half> @llvm.minnum.v16f16(<16 x half> %x, <16 x half> %y) readnone + ret <16 x half> %z +} + +define <32 x half> @test_intrinsic_fmin_v32f16(<32 x half> %x, <32 x half> %y) { +; CHECK-LABEL: test_intrinsic_fmin_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm2 # encoding: [0x62,0xf5,0x74,0x48,0x5d,0xd0] +; CHECK-NEXT: vcmpunordph %zmm0, %zmm0, %k1 # encoding: [0x62,0xf3,0x7c,0x48,0xc2,0xc8,0x03] +; CHECK-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xd1] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; CHECK-NEXT: retq # encoding: [0xc3] + %z = call <32 x half> @llvm.minnum.v32f16(<32 x half> %x, <32 x half> %y) readnone + ret <32 x half> %z +} + +define <4 x half> @minnum_intrinsic_nnan_fmf_f432(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: minnum_intrinsic_nnan_fmf_f432: +; CHECK: # %bb.0: +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7c,0x08,0x5d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call nnan <4 x half> 
@llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %r +} + +define half @minnum_intrinsic_nnan_attr_f16(half %a, half %b) #0 { +; CHECK-LABEL: minnum_intrinsic_nnan_attr_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5d,0xc1] +; CHECK-NEXT: retq # encoding: [0xc3] + %r = tail call half @llvm.minnum.f16(half %a, half %b) + ret half %r +} + +define half @test_minnum_const_op1(half %x) { +; CHECK-LABEL: test_minnum_const_op1: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5d,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.minnum.f16(half 1.0, half %x) + ret half %r +} + +define half @test_minnum_const_op2(half %x) { +; CHECK-LABEL: test_minnum_const_op2: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x5d,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.minnum.f16(half %x, half 1.0) + ret half %r +} + +define half @test_minnum_const_nan(half %x) { +; CHECK-LABEL: test_minnum_const_nan: +; CHECK: # %bb.0: +; CHECK-NEXT: retq # encoding: [0xc3] + %r = call half @llvm.minnum.f16(half %x, half 0x7fff000000000000) + ret half %r +} + +attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll b/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll new file mode 100644 index 0000000000000..4e8d1f16a655f --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s 
| FileCheck %s + +; Verify that we're folding the load into the math instruction. +; This pattern is generated out of the simplest intrinsics usage: +; _mm_add_ss(a, _mm_load_ss(b)); + +define <8 x half> @addsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: addsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fadd half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @subsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: subsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fsub half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @mulsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: mulsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fmul half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @divsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: divsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = fdiv half %a, %b + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @minsh(<8 x half> %va, half* %pb) { +; CHECK-LABEL: minsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vminsh (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = call nnan half @llvm.minnum.f16(half %a, half %b) readnone + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +define <8 x half> @maxsh(<8 x half> %va, half* %pb) { 
+; CHECK-LABEL: maxsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vmaxsh (%rdi), %xmm0, %xmm1 +; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = extractelement <8 x half> %va, i32 0 + %b = load half, half* %pb + %r = call nnan half @llvm.maxnum.f16(half %a, half %b) readnone + %vr = insertelement <8 x half> %va, half %r, i32 0 + ret <8 x half> %vr +} + +declare half @llvm.minnum.f16(half, half) +declare half @llvm.maxnum.f16(half, half) diff --git a/llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll b/llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll new file mode 100644 index 0000000000000..33d4cc164fb68 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fold-xmm-zero.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+avx512fp16 -no-integrated-as | FileCheck %s + +; Simple test to make sure folding for special constants (like half zero) +; isn't completely broken. + +; CHECK: vdivsh LCPI0 + +%0 = type { half, half, half, half, half, half, half, half } + +define void @f() nounwind ssp { +entry: + %0 = tail call %0 asm sideeffect "foo", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00, half 8.000000e+00) nounwind + %asmresult = extractvalue %0 %0, 0 + %asmresult8 = extractvalue %0 %0, 1 + %asmresult9 = extractvalue %0 %0, 2 + %asmresult10 = extractvalue %0 %0, 3 + %asmresult11 = extractvalue %0 %0, 4 + %asmresult12 = extractvalue %0 %0, 5 + %asmresult13 = extractvalue %0 %0, 6 + %asmresult14 = extractvalue %0 %0, 7 + %div = fdiv half %asmresult, 0.000000e+00 + %1 = tail call %0 asm sideeffect "bar", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(half %div, half %asmresult8, half %asmresult9, half %asmresult10, half %asmresult11, half %asmresult12, half %asmresult13, half 
%asmresult14) nounwind + %asmresult24 = extractvalue %0 %1, 0 + %asmresult25 = extractvalue %0 %1, 1 + %asmresult26 = extractvalue %0 %1, 2 + %asmresult27 = extractvalue %0 %1, 3 + %asmresult28 = extractvalue %0 %1, 4 + %asmresult29 = extractvalue %0 %1, 5 + %asmresult30 = extractvalue %0 %1, 6 + %asmresult31 = extractvalue %0 %1, 7 + %div33 = fdiv half %asmresult24, 0.000000e+00 + %2 = tail call %0 asm sideeffect "baz", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(half %div33, half %asmresult25, half %asmresult26, half %asmresult27, half %asmresult28, half %asmresult29, half %asmresult30, half %asmresult31) nounwind + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll new file mode 100644 index 0000000000000..d2ff3a1215dd7 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-fp-logic.ll @@ -0,0 +1,381 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s + +; Test cases derived from float/double tests in fp-logic.ll + +; 1 FP operand, 1 int operand, int result + +define i16 @f1(half %x, i16 %y) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, %y + ret i16 %and +} + +; Swap operands of the logic op. 
+ +define i16 @f2(half %x, i16 %y) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl %edi, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %y, %bc1 + ret i16 %and +} + +; 1 FP operand, 1 constant operand, int result + +define i16 @f3(half %x) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 1 + ret i16 %and +} + +; Swap operands of the logic op. + +define i16 @f4(half %x) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: andl $2, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 2, %bc1 + ret i16 %and +} + +; 1 FP operand, 1 integer operand, FP result + +define half @f5(half %x, i16 %y) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %edi, %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, %y + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; Swap operands of the logic op. + +define half @f6(half %x, i16 %y) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %edi, %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %y, %bc1 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; 1 FP operand, 1 constant operand, FP result + +define half @f7(half %x) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 3 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; Swap operands of the logic op. 
+ +define half @f8(half %x) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 4, %bc1 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; 2 FP operands, int result + +define i16 @f9(half %x, half %y) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = and i16 %bc1, %bc2 + ret i16 %and +} + +; 2 FP operands, FP result + +define half @f10(half %x, half %y) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = and i16 %bc1, %bc2 + %bc3 = bitcast i16 %and to half + ret half %bc3 +} + +define half @or(half %x, half %y) { +; CHECK-LABEL: or: +; CHECK: # %bb.0: +; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = or i16 %bc1, %bc2 + %bc3 = bitcast i16 %and to half + ret half %bc3 +} + +define half @xor(half %x, half %y) { +; CHECK-LABEL: xor: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %bc2 = bitcast half %y to i16 + %and = xor i16 %bc1, %bc2 + %bc3 = bitcast i16 %and to half + ret half %bc3 +} + +define half @f7_or(half %x) { +; CHECK-LABEL: f7_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = or i16 %bc1, 3 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +define half @f7_xor(half %x) { +; CHECK-LABEL: f7_xor: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; 
CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = xor i16 %bc1, 3 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +; Grabbing the sign bit is a special case that could be handled +; by movmskps/movmskpd, but if we're not shifting it over, then +; a simple FP logic op is cheaper. + +define half @movmsk(half %x) { +; CHECK-LABEL: movmsk: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 32768 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +define half @bitcast_fabs(half %x) { +; CHECK-LABEL: bitcast_fabs: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %and = and i16 %bc1, 32767 + %bc2 = bitcast i16 %and to half + ret half %bc2 +} + +define half @bitcast_fneg(half %x) { +; CHECK-LABEL: bitcast_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %x to i16 + %xor = xor i16 %bc1, 32768 + %bc2 = bitcast i16 %xor to half + ret half %bc2 +} + +define <8 x half> @bitcast_fabs_vec(<8 x half> %x) { +; CHECK-LABEL: bitcast_fabs_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %x to <8 x i16> + %and = and <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %and to <8 x half> + ret <8 x half> %bc2 +} + +define <8 x half> @bitcast_fneg_vec(<8 x half> %x) { +; CHECK-LABEL: bitcast_fneg_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpxor 
%xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %x to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + ret <8 x half> %bc2 +} + +define half @fadd_bitcast_fneg(half %x, half %y) { +; CHECK-LABEL: fadd_bitcast_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %y to i16 + %xor = xor i16 %bc1, 32768 + %bc2 = bitcast i16 %xor to half + %fadd = fadd half %x, %bc2 + ret half %fadd +} + +define half @fsub_bitcast_fneg(half %x, half %y) { +; CHECK-LABEL: fsub_bitcast_fneg: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: vxorps %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast half %y to i16 + %xor = xor i16 %bc1, 32767 + %bc2 = bitcast i16 %xor to half + %fsub = fsub half %x, %bc2 + ret half %fsub +} + +define half @nabs(half %a) { +; CHECK-LABEL: nabs: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %conv = bitcast half %a to i16 + %and = or i16 %conv, -32768 + %conv1 = bitcast i16 %and to half + ret half %conv1 +} + +define <8 x half> @nabsv8f16(<8 x half> %a) { +; CHECK-LABEL: nabsv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %conv = bitcast <8 x half> %a to <8 x i16> + %and = or <8 x i16> %conv, + %conv1 = bitcast <8 x i16> %and to <8 x half> + ret <8 x half> %conv1 +} + +define <8 x half> @fadd_bitcast_fneg_vec(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fadd_bitcast_fneg_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x 
half> + %fadd = fadd <8 x half> %x, %bc2 + ret <8 x half> %fadd +} + +define <8 x half> @fadd_bitcast_fneg_vec_undef_elts(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fadd_bitcast_fneg_vec_undef_elts: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fadd = fadd <8 x half> %x, %bc2 + ret <8 x half> %fadd +} + +define <8 x half> @fsub_bitcast_fneg_vec(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fsub_bitcast_fneg_vec: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fsub = fsub <8 x half> %x, %bc2 + ret <8 x half> %fsub +} + +define <8 x half> @fsub_bitcast_fneg_vec_undef_elts(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fsub_bitcast_fneg_vec_undef_elts: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <8 x i16> + %xor = xor <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %xor to <8 x half> + %fsub = fsub <8 x half> %x, %bc2 + ret <8 x half> %fsub +} + +define <8 x half> @fadd_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fadd_bitcast_fneg_vec_width: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y to <2 x i64> + %xor = xor <2 x i64> %bc1, + %bc2 = bitcast <2 x i64> %xor to <8 x half> + %fadd = fadd <8 x half> %x, %bc2 + ret <8 x half> %fadd +} + +define <8 x half> @fsub_bitcast_fneg_vec_width(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: fsub_bitcast_fneg_vec_width: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %bc1 = bitcast <8 x half> %y 
to <2 x i64> + %xor = xor <2 x i64> %bc1, + %bc2 = bitcast <2 x i64> %xor to <8 x half> + %fsub = fsub <8 x half> %x, %bc2 + ret <8 x half> %fsub +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll new file mode 100644 index 0000000000000..eb6511e0edc73 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s + +declare i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half>, <8 x half>, i32, i32) + +define i32 @test_x86_avx512fp16_ucomi_sh_lt(<8 x half> %a0, <8 x half> %a1) { +; CHECK-LABEL: test_x86_avx512fp16_ucomi_sh_lt: +; CHECK: # %bb.0: +; CHECK-NEXT: vcmpngesh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: retq + %res = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %a0, <8 x half> %a1, i32 9, i32 4) + ret i32 %res +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_add_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vaddsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vaddsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 
= call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_sub_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vsubsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vsubsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vsubsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_mul_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: 
vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vmulsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmulsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_div_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vdivsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vdivsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %res1, <8 x half> %x2, 
<8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_min_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vminsh %xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vminsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, half * %ptr) { +; CHECK-LABEL: test_int_x86_avx512fp16_mask_max_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm2, %xmm3 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm3 {%k1} +; CHECK-NEXT: vmaxsh 
%xmm1, %xmm3, %xmm0 {%k1} {z} +; CHECK-NEXT: vmaxsh (%rsi), %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %val.half = load half,half * %ptr + %val = insertelement <8 x half> undef, half %val.half, i32 0 + %res0 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %x1, <8 x half> %x2, <8 x half> zeroinitializer, i8 -1, i32 4) + %res1 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %res0, <8 x half> %x2, <8 x half> %src , i8 %mask, i32 4) + %res2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %res1, <8 x half> %x2, <8 x half> zeroinitializer , i8 %mask, i32 4) + %res3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %res2, <8 x half> %val, <8 x half> %src , i8 %mask, i32 4) + ret <8 x half> %res3 +} + +declare i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half>, <8 x half>, i32, i8, i32) + +define i8 @test_int_x86_avx512_mask_cmp_sh(<8 x half> %x0, <8 x half> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sh: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmpunordsh %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res2 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 3, i8 %x3, i32 4) + ret i8 %res2 +} + + +define i8 @test_int_x86_avx512_mask_cmp_sh_all(<8 x half> %x0, <8 x half> %x1, i8 %x3, i32 %x4) { +; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sh_all: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vcmplesh %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %ecx +; CHECK-NEXT: vcmpunordsh {sae}, %xmm1, %xmm0, %k0 +; CHECK-NEXT: kmovd %k0, %edx +; CHECK-NEXT: vcmpneqsh %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %esi +; CHECK-NEXT: vcmpnltsh {sae}, %xmm1, %xmm0, %k0 {%k1} +; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: andb %cl, %al +; 
CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq + %res1 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 2, i8 -1, i32 4) + %res2 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 3, i8 -1, i32 8) + %res3 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 4, i8 %x3, i32 4) + %res4 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 5, i8 %x3, i32 8) + + %res11 = and i8 %res1, %res2 + %res12 = and i8 %res3, %res4 + %res13 = and i8 %res11, %res12 + ret i8 %res13 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll b/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll new file mode 100644 index 0000000000000..d8ab0de2ca97b --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-machine-combiner.ll @@ -0,0 +1,345 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s + +; Incremental updates of the instruction depths should be enough for this test +; case. +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512fp16 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s + +; Verify that the first two adds are independent regardless of how the inputs are +; commuted. The destination registers are used as source registers for the third add. 
+ +define half @reassociate_adds1(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds1: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %t0, %x2 + %t2 = fadd reassoc nsz half %t1, %x3 + ret half %t2 +} + +define half @reassociate_adds2(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %x2, %t0 + %t2 = fadd reassoc nsz half %t1, %x3 + ret half %t2 +} + +define half @reassociate_adds3(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds3: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %t0, %x2 + %t2 = fadd reassoc nsz half %x3, %t1 + ret half %t2 +} + +define half @reassociate_adds4(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds4: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %x2, %t0 + %t2 = fadd reassoc nsz half %x3, %t1 + ret half %t2 +} + +; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not +; produced because that would cost more compile time. 
+ +define half @reassociate_adds5(half %x0, half %x1, half %x2, half %x3, half %x4, half %x5, half %x6, half %x7) { +; CHECK-LABEL: reassociate_adds5: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm5, %xmm4, %xmm1 +; CHECK-NEXT: vaddsh %xmm6, %xmm1, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm7, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %t0, %x2 + %t2 = fadd reassoc nsz half %t1, %x3 + %t3 = fadd reassoc nsz half %t2, %x4 + %t4 = fadd reassoc nsz half %t3, %x5 + %t5 = fadd reassoc nsz half %t4, %x6 + %t6 = fadd reassoc nsz half %t5, %x7 + ret half %t6 +} + +; Verify that we only need two associative operations to reassociate the operands. +; Also, we should reassociate such that the result of the high latency division +; is used by the final 'add' rather than reassociating the %x3 operand with the +; division. The latter reassociation would not improve anything. + +define half @reassociate_adds6(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_adds6: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz half %x0, %x1 + %t1 = fadd reassoc nsz half %x2, %t0 + %t2 = fadd reassoc nsz half %x3, %t1 + ret half %t2 +} + +; Verify that SSE and AVX scalar single-precision multiplies are reassociated. 
+ +define half @reassociate_muls1(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_muls1: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz half %x0, %x1 + %t1 = fmul reassoc nsz half %x2, %t0 + %t2 = fmul reassoc nsz half %x3, %t1 + ret half %t2 +} + +; Verify that SSE and AVX 128-bit vector half-precision adds are reassociated. + +define <8 x half> @reassociate_adds_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_adds_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz <8 x half> %x0, %x1 + %t1 = fadd reassoc nsz <8 x half> %x2, %t0 + %t2 = fadd reassoc nsz <8 x half> %x3, %t1 + ret <8 x half> %t2 +} + +; Verify that SSE and AVX 128-bit vector half-precision multiplies are reassociated. + +define <8 x half> @reassociate_muls_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_muls_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmulph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz <8 x half> %x0, %x1 + %t1 = fmul reassoc nsz <8 x half> %x2, %t0 + %t2 = fmul reassoc nsz <8 x half> %x3, %t1 + ret <8 x half> %t2 +} + +; Verify that AVX 256-bit vector half-precision adds are reassociated. 
+ +define <16 x half> @reassociate_adds_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_adds_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz <16 x half> %x0, %x1 + %t1 = fadd reassoc nsz <16 x half> %x2, %t0 + %t2 = fadd reassoc nsz <16 x half> %x3, %t1 + ret <16 x half> %t2 +} + +; Verify that AVX 256-bit vector half-precision multiplies are reassociated. + +define <16 x half> @reassociate_muls_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_muls_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmulph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz <16 x half> %x0, %x1 + %t1 = fmul reassoc nsz <16 x half> %x2, %t0 + %t2 = fmul reassoc nsz <16 x half> %x3, %t1 + ret <16 x half> %t2 +} + +; Verify that AVX512 512-bit vector half-precision adds are reassociated. + +define <32 x half> @reassociate_adds_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_adds_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fdiv reassoc nsz <32 x half> %x0, %x1 + %t1 = fadd reassoc nsz <32 x half> %x2, %t0 + %t2 = fadd reassoc nsz <32 x half> %x3, %t1 + ret <32 x half> %t2 +} + +; Verify that AVX512 512-bit vector half-precision multiplies are reassociated. 
+ +define <32 x half> @reassociate_muls_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_muls_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmulph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fadd reassoc nsz <32 x half> %x0, %x1 + %t1 = fmul reassoc nsz <32 x half> %x2, %t0 + %t2 = fmul reassoc nsz <32 x half> %x3, %t1 + ret <32 x half> %t2 +} + +; Verify that SSE and AVX scalar half-precision minimum ops are reassociated. + +define half @reassociate_mins_half(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_mins_half: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vminsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv half %x0, %x1 + %cmp1 = fcmp olt half %x2, %t0 + %sel1 = select i1 %cmp1, half %x2, half %t0 + %cmp2 = fcmp olt half %x3, %sel1 + %sel2 = select i1 %cmp2, half %x3, half %sel1 + ret half %sel2 +} + +; Verify that SSE and AVX scalar half-precision maximum ops are reassociated. + +define half @reassociate_maxs_half(half %x0, half %x1, half %x2, half %x3) { +; CHECK-LABEL: reassociate_maxs_half: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmaxsh %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fdiv half %x0, %x1 + %cmp1 = fcmp ogt half %x2, %t0 + %sel1 = select i1 %cmp1, half %x2, half %t0 + %cmp2 = fcmp ogt half %x3, %sel1 + %sel2 = select i1 %cmp2, half %x3, half %sel1 + ret half %sel2 +} + +; Verify that SSE and AVX 128-bit vector half-precision minimum ops are reassociated. 
+ +define <8 x half> @reassociate_mins_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_mins_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vminph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vminph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd <8 x half> %x0, %x1 + %cmp1 = fcmp olt <8 x half> %x2, %t0 + %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0 + %cmp2 = fcmp olt <8 x half> %x3, %sel1 + %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1 + ret <8 x half> %sel2 +} + +; Verify that SSE and AVX 128-bit vector half-precision maximum ops are reassociated. + +define <8 x half> @reassociate_maxs_v8f16(<8 x half> %x0, <8 x half> %x1, <8 x half> %x2, <8 x half> %x3) { +; CHECK-LABEL: reassociate_maxs_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmaxph %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vmaxph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd <8 x half> %x0, %x1 + %cmp1 = fcmp ogt <8 x half> %x2, %t0 + %sel1 = select <8 x i1> %cmp1, <8 x half> %x2, <8 x half> %t0 + %cmp2 = fcmp ogt <8 x half> %x3, %sel1 + %sel2 = select <8 x i1> %cmp2, <8 x half> %x3, <8 x half> %sel1 + ret <8 x half> %sel2 +} + +; Verify that AVX 256-bit vector half-precision minimum ops are reassociated. 
+ +define <16 x half> @reassociate_mins_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_mins_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vminph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vminph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fadd <16 x half> %x0, %x1 + %cmp1 = fcmp olt <16 x half> %x2, %t0 + %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0 + %cmp2 = fcmp olt <16 x half> %x3, %sel1 + %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1 + ret <16 x half> %sel2 +} + +; Verify that AVX 256-bit vector half-precision maximum ops are reassociated. + +define <16 x half> @reassociate_maxs_v16f16(<16 x half> %x0, <16 x half> %x1, <16 x half> %x2, <16 x half> %x3) { +; CHECK-LABEL: reassociate_maxs_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmaxph %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vmaxph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %t0 = fadd <16 x half> %x0, %x1 + %cmp1 = fcmp ogt <16 x half> %x2, %t0 + %sel1 = select <16 x i1> %cmp1, <16 x half> %x2, <16 x half> %t0 + %cmp2 = fcmp ogt <16 x half> %x3, %sel1 + %sel2 = select <16 x i1> %cmp2, <16 x half> %x3, <16 x half> %sel1 + ret <16 x half> %sel2 +} + +; Verify that AVX512 512-bit vector half-precision minimum ops are reassociated. 
+ +define <32 x half> @reassociate_mins_v32f16(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_mins_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vminph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fadd <32 x half> %x0, %x1 + %cmp1 = fcmp olt <32 x half> %x2, %t0 + %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0 + %cmp2 = fcmp olt <32 x half> %x3, %sel1 + %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1 + ret <32 x half> %sel2 +} + +; Verify that AVX512 512-bit vector half-precision maximum ops are reassociated. + +define <32 x half> @reassociate_maxs_v16f32(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, <32 x half> %x3) { +; CHECK-LABEL: reassociate_maxs_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmaxph %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %t0 = fadd <32 x half> %x0, %x1 + %cmp1 = fcmp ogt <32 x half> %x2, %t0 + %sel1 = select <32 x i1> %cmp1, <32 x half> %x2, <32 x half> %t0 + %cmp2 = fcmp ogt <32 x half> %x3, %sel1 + %sel2 = select <32 x i1> %cmp2, <32 x half> %x3, <32 x half> %sel1 + ret <32 x half> %sel2 +} + diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll index 9af66e4f3b3d9..0b384a4d10c3a 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1217,6 +1217,19 @@ define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) { ret <8 x half> %res } +define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: movsh: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] +; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + %res2 = 
shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + %res = fadd <8 x half> %res1, %res2 + ret <8 x half> %res +} + define i16 @test_movw(half %x) { ; X64-LABEL: test_movw: ; X64: # %bb.0: @@ -1885,3 +1898,31 @@ define <4 x float> @regression2(i8 addrspace(1)* %0, <4 x i32> %1, <4 x i32> %2, %18 = fmul contract <4 x float> %17, ret <4 x float> %18 } + +; Make sure load/stores of v4f16 are handled well on 32-bit targets where +; default widening legalization can't use i64. +define void @load_store_v4f16(<4 x half>* %x, <4 x half>* %y, <4 x half>* %z) { +; X64-LABEL: load_store_v4f16: +; X64: # %bb.0: +; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; X64-NEXT: vmovlps %xmm0, (%rdx) +; X64-NEXT: retq +; +; X86-LABEL: load_store_v4f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; X86-NEXT: vmovlps %xmm0, (%eax) +; X86-NEXT: retl + %a = load <4 x half>, <4 x half>* %x + %b = load <4 x half>, <4 x half>* %y + %c = fadd <4 x half> %a, %b + store <4 x half> %c, <4 x half>* %z + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll new file mode 100644 index 0000000000000..7da327b10d412 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-unsafe-fp-math.ll @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64 -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mattr=+avx512fp16 | FileCheck %s --check-prefix=CHECK_UNSAFE +; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512fp16 | FileCheck %s --check-prefix=CHECK + +define <32 x half> @test_max_v32f16(<32 x half> * %a_ptr, <32 x half> %b) { +; 
CHECK_UNSAFE-LABEL: test_max_v32f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vmaxph (%rdi), %zmm0, %zmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vmaxph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = load <32 x half>, <32 x half>* %a_ptr + %tmp = fcmp fast ogt <32 x half> %a, %b + %tmp4 = select <32 x i1> %tmp, <32 x half> %a, <32 x half> %b + ret <32 x half> %tmp4; +} + +define <32 x half> @test_min_v32f16(<32 x half>* %a_ptr, <32 x half> %b) { +; CHECK_UNSAFE-LABEL: test_min_v32f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vminph (%rdi), %zmm0, %zmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %zmm1 +; CHECK-NEXT: vminph %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %a = load <32 x half>, <32 x half>* %a_ptr + %tmp = fcmp fast olt <32 x half> %a, %b + %tmp4 = select <32 x i1> %tmp, <32 x half> %a, <32 x half> %b + ret <32 x half> %tmp4; +} + +define <16 x half> @test_max_v16f16(<16 x half> * %a_ptr, <16 x half> %b) { +; CHECK_UNSAFE-LABEL: test_max_v16f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vmaxph (%rdi), %ymm0, %ymm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vmaxph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = load <16 x half>, <16 x half>* %a_ptr + %tmp = fcmp fast ogt <16 x half> %a, %b + %tmp4 = select <16 x i1> %tmp, <16 x half> %a, <16 x half> %b + ret <16 x half> %tmp4; +} + +define <16 x half> @test_min_v16f16(<16 x half>* %a_ptr, <16 x half> %b) { +; CHECK_UNSAFE-LABEL: test_min_v16f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vminph (%rdi), %ymm0, %ymm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %ymm1 +; CHECK-NEXT: vminph %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %a = load <16 x half>, <16 x half>* 
%a_ptr + %tmp = fcmp fast olt <16 x half> %a, %b + %tmp4 = select <16 x i1> %tmp, <16 x half> %a, <16 x half> %b + ret <16 x half> %tmp4; +} + +define <8 x half> @test_max_v8f16(<8 x half> * %a_ptr, <8 x half> %b) { +; CHECK_UNSAFE-LABEL: test_max_v8f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vmaxph (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vmaxph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = load <8 x half>, <8 x half>* %a_ptr + %tmp = fcmp fast ogt <8 x half> %a, %b + %tmp4 = select <8 x i1> %tmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %tmp4; +} + +define <8 x half> @test_min_v8f16(<8 x half>* %a_ptr, <8 x half> %b) { +; CHECK_UNSAFE-LABEL: test_min_v8f16: +; CHECK_UNSAFE: # %bb.0: +; CHECK_UNSAFE-NEXT: vminph (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm1 +; CHECK-NEXT: vminph %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %a = load <8 x half>, <8 x half>* %a_ptr + %tmp = fcmp fast olt <8 x half> %a, %b + %tmp4 = select <8 x i1> %tmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %tmp4; +} + +define half @test_max_f16(half %a, half* %ptr) { +; CHECK_UNSAFE-LABEL: test_max_f16: +; CHECK_UNSAFE: # %bb.0: # %entry +; CHECK_UNSAFE-NEXT: vmaxsh (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_max_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovsh (%rdi), %xmm1 +; CHECK-NEXT: vmaxsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = load half, half* %ptr + %1 = fcmp fast ogt half %0, %a + %2 = select i1 %1, half %0, half %a + ret half %2 +} + +define half @test_min_f16(half %a, half* %ptr) { +; CHECK_UNSAFE-LABEL: test_min_f16: +; CHECK_UNSAFE: # %bb.0: # %entry +; CHECK_UNSAFE-NEXT: vminsh (%rdi), %xmm0, %xmm0 +; CHECK_UNSAFE-NEXT: retq +; +; CHECK-LABEL: test_min_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovsh 
(%rdi), %xmm1 +; CHECK-NEXT: vminsh %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = load half, half* %ptr + %1 = fcmp fast olt half %0, %a + %2 = select i1 %1, half %0, half %a + ret half %2 +} diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll new file mode 100644 index 0000000000000..dc70de2c57414 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll @@ -0,0 +1,719 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-64 + +define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oeq_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oeq_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ogt_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; 
CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ogt_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oge_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oge_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_olt_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_olt_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"olt", + metadata 
!"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ole_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ole_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_one_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_one_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ord_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), 
%ecx +; CHECK-32-NEXT: cmovnpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ord_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ueq_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ueq_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ugt_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ugt_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ugt", + 
metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uge_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uge_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ult_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ult_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ule_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal 
{{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ule_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_une_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_une_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %esi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %edi, %eax +; CHECK-64-NEXT: cmovpl %edi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uno_q: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uno_q: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call 
i1 @llvm.experimental.constrained.fcmp.f16( + half %f1, half %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oeq_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oeq_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ogt_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ogt_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_oge_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh 
{{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_oge_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_olt_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmoval %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_olt_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovbel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ole_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovael %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ole_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; 
CHECK-64-NEXT: cmovbl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_one_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_one_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ord_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ord_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ueq_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh 
{{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ueq_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ugt_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ugt_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uge_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uge_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm0, %xmm1 +; 
CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ult_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ult_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovael %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_ule_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovbel %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_ule_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmoval %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_une_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh 
{{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovnel %eax, %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_une_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %esi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnel %edi, %eax +; CHECK-64-NEXT: cmovpl %edi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; CHECK-32-LABEL: test_f16_uno_s: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vcomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: leal {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: cmovpl %eax, %ecx +; CHECK-32-NEXT: movl (%ecx), %eax +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: test_f16_uno_s: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: movl %edi, %eax +; CHECK-64-NEXT: vcomish %xmm1, %xmm0 +; CHECK-64-NEXT: cmovnpl %esi, %eax +; CHECK-64-NEXT: retq + %cond = call i1 @llvm.experimental.constrained.fcmps.f16( + half %f1, half %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select i1 %cond, i32 %a, i32 %b + ret i32 %res +} + +define void @foo(half %0, half %1) #0 { +; CHECK-32-LABEL: foo: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: vucomish {{[0-9]+}}(%esp), %xmm0 +; CHECK-32-NEXT: jbe .LBB28_1 +; CHECK-32-NEXT: # %bb.2: +; CHECK-32-NEXT: jmp bar@PLT # TAILCALL +; CHECK-32-NEXT: .LBB28_1: +; CHECK-32-NEXT: retl +; +; CHECK-64-LABEL: foo: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: vucomish %xmm1, %xmm0 +; CHECK-64-NEXT: jbe .LBB28_1 +; CHECK-64-NEXT: # 
%bb.2: +; CHECK-64-NEXT: jmp bar@PLT # TAILCALL +; CHECK-64-NEXT: .LBB28_1: +; CHECK-64-NEXT: retq + %3 = call i1 @llvm.experimental.constrained.fcmp.f16( half %0, half %1, metadata !"ogt", metadata !"fpexcept.strict") #0 + br i1 %3, label %4, label %5 + +4: ; preds = %2 + tail call void @bar() #0 + br label %5 + +5: ; preds = %4, %2 + ret void +} +declare void @bar() + +attributes #0 = { strictfp } + +declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmps.f16(half, half, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll new file mode 100644 index 0000000000000..9d58a262dcc6c --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 + +declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) +declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) + +define half @fadd_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fadd_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vaddsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fadd_f16: +; X64: # %bb.0: +; X64-NEXT: vaddsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fadd.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +define half 
@fsub_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fsub_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vsubsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fsub_f16: +; X64: # %bb.0: +; X64-NEXT: vsubsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fsub.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +define half @fmul_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fmul_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmulsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fmul_f16: +; X64: # %bb.0: +; X64-NEXT: vmulsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fmul.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +define half @fdiv_f16(half %a, half %b) nounwind strictfp { +; X86-LABEL: fdiv_f16: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vdivsh {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: fdiv_f16: +; X64: # %bb.0: +; X64-NEXT: vdivsh %xmm1, %xmm0, %xmm0 +; X64-NEXT: retq + %ret = call half @llvm.experimental.constrained.fdiv.f16(half %a, half %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret half %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll index 0d1ebff5347a7..92da9ffefde70 100644 --- a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll +++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll @@ -1,5 +1,68 @@ ; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+avx512fp16 -mattr=+avx512vl -o - | FileCheck %s +; This test checks that only a single je gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. 
+; CHECK-LABEL: foo1: +; CHECK: je +; CHECK-NOT: je +define <8 x half> @foo1(i32 %v1, <8 x half> %v2, <8 x half> %v3, <8 x half> %v4) nounwind { +entry: + %cmp = icmp eq i32 %v1, 0 + %t1 = select i1 %cmp, <8 x half> %v2, <8 x half> %v3 + %t2 = select i1 %cmp, <8 x half> %v3, <8 x half> %v4 + %sub = fsub <8 x half> %t1, %t2 + ret <8 x half> %sub +} + +; This test checks that only a single ja gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. This combines +; all the supported types together into one long string of selects based +; on the same condition. +; CHECK-LABEL: foo2: +; CHECK: ja +; CHECK-NOT: ja +define void @foo2(i32 %v1, + half %v32, half %v33, + <8 x half> %v52, <8 x half> %v53, + <16 x half> %v122, <16 x half> %v123, + <32 x half> %v132, <32 x half> %v133, + i8 * %dst) nounwind { +entry: + %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 2 + %a31 = bitcast i8* %add.ptr31 to half* + + %add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 4 + %a51 = bitcast i8* %add.ptr51 to <8 x half>* + + %add.ptr121 = getelementptr inbounds i8, i8* %dst, i32 20 + %a121 = bitcast i8* %add.ptr121 to <16 x half>* + + %add.ptr131 = getelementptr inbounds i8, i8* %dst, i32 52 + %a131 = bitcast i8* %add.ptr131 to <32 x half>* + + ; These operations are necessary, because select of two single use loads + ; ends up getting optimized into a select of two leas, followed by a + ; single load of the selected address. 
+ + %t33 = fadd half %v33, %v32 + %t53 = fadd <8 x half> %v53, %v52 + %t123 = fadd <16 x half> %v123, %v122 + %t133 = fadd <32 x half> %v133, %v132 + + %cmp = icmp ugt i32 %v1, 31 + %t31 = select i1 %cmp, half %v32, half %t33 + %t51 = select i1 %cmp, <8 x half> %v52, <8 x half> %t53 + %t121 = select i1 %cmp, <16 x half> %v122, <16 x half> %t123 + %t131 = select i1 %cmp, <32 x half> %v132, <32 x half> %t133 + + store half %t31, half* %a31, align 2 + store <8 x half> %t51, <8 x half>* %a51, align 16 + store <16 x half> %t121, <16 x half>* %a121, align 32 + store <32 x half> %t131, <32 x half>* %a131, align 64 + + ret void +} + ; This test checks that only a single jne gets generated in the final code ; for lowering the CMOV pseudos that get created for this IR. define dso_local <32 x half> @foo3(<32 x half> %a, <32 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll new file mode 100644 index 0000000000000..222abba7d2f7c --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll @@ -0,0 +1,572 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. 
+ +define <32 x half> @stack_fold_addph_zmm(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_addph_zmm + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a0, %a1 + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_addph_zmm_k: + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a0, %a1 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_addph_zmm_k_commuted: + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) { + ;CHECK-LABEL: stack_fold_addph_zmm_kz + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_addsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_addsh + ;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_addsh_int + ;CHECK: vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} 
{{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fadd half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_cmpph + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 0, <32 x i1> , i32 4) + %2 = bitcast <32 x i1> %res to i32 + ret i32 %2 +} +declare <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half>, <32 x half>, i32, <32 x i1>, i32) + +define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, <32 x half>* %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) { + ;CHECK-LABEL: stack_fold_cmpph_mask: + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + ; load and fadd are 
here to keep the operations below the side effecting block and to avoid folding the wrong load + %2 = load <32 x half>, <32 x half>* %a2 + %3 = fadd <32 x half> %a1, %2 + %4 = bitcast i32 %mask to <32 x i1> + %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %3, <32 x half> %a0, i32 0, <32 x i1> , i32 4) + %6 = and <32 x i1> %4, %5 + %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1 + ret <32 x half> %7 +} + +define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half> %a1, <32 x half>* %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) { + ;CHECK-LABEL: stack_fold_cmpph_mask_commuted: + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load + %2 = load <32 x half>, <32 x half>* %a2 + %3 = fadd <32 x half> %a1, %2 + %4 = bitcast i32 %mask to <32 x i1> + %5 = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %3, i32 0, <32 x i1> , i32 4) + %6 = and <32 x i1> %4, %5 + %7 = select <32 x i1> %6, <32 x half> %b0, <32 x half> %b1 + ret <32 x half> %7 +} + +define half @stack_fold_divsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_divsh + ;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fdiv half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_divsh_int + ;CHECK: vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fdiv half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone + +define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commuted: + ;CHECK-NOT: vmaxph 
{{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_k: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_k_commuted: + ;CHECK-NOT: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_kz: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted: + ;CHECK-NOT: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x 
half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_k: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted: + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_maxsh(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_maxsh: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define 
half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_maxsh_commuted: + ;CHECK-NOT: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_maxsh_commutable: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_maxsh_commutable_commuted: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp ogt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 { + 
;CHECK-LABEL: stack_fold_maxsh_int: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, <8 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_maxsh_mask: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) { + ;CHECK-LABEL: stack_fold_maxsh_maskz: + ;CHECK: vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4) + ret <8 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} +declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone + +define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_commuted: + ;CHECK-NOT: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, 
i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_k: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_k_commuted: + ;CHECK-NOT: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_kz: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm 
sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { + ;CHECK-LABEL: stack_fold_minph_zmm_kz_commuted: + ;CHECK-NOT: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_minph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> 
@llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + ret <32 x half> %2 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_k: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} 
{{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_kz: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 { + ;CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted: + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4) + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_minsh(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_minsh: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 { + ;CHECK-LABEL: stack_fold_minsh_commuted: + ;CHECK-NOT: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_minsh_commutable: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call 
<2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a0, %a1 + %3 = select i1 %2, half %a0, half %a1 + ret half %3 +} + +define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 { + ;CHECK-LABEL: stack_fold_minsh_commutable_commuted: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fcmp olt half %a1, %a0 + %3 = select i1 %2, half %a1, half %a0 + ret half %3 +} + +define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minsh_int: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32) + +define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, <8 x half>* %passthru) { + ;CHECK-LABEL: 
stack_fold_minsh_mask: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = load <8 x half>, <8 x half>* %passthru + %3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4) + ret <8 x half> %3 +} + +define <8 x half> @stack_fold_minsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) { + ;CHECK-LABEL: stack_fold_minsh_maskz: + ;CHECK: vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4) + ret <8 x half> %2 +} + +define <32 x half> @stack_fold_mulph_zmm(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulph_zmm + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a0, %a1 
+ ret <32 x half> %2 +} + +define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_mulph_zmm_k: + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a0, %a1 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, <32 x half>* %passthru) { + ;CHECK-LABEL: stack_fold_mulph_zmm_k_commuted: + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = load <32 x half>, <32 x half>* %passthru + %5 = select <32 x i1> %3, <32 x half> %2, <32 x half> %4 + ret <32 x half> %5 +} + +define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) { + ;CHECK-LABEL: stack_fold_mulph_zmm_kz + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <32 x half> %a1, %a0 + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> zeroinitializer + ret <32 x half> %4 +} + +define half @stack_fold_mulsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_mulsh + ;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulsh_int + ;CHECK-NOT: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fmul half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) { + ;CHECK-LABEL: stack_fold_subph_zmm + ;CHECK: vsubph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload + %1 = tail call 
<2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fsub <32 x half> %a0, %a1 + ret <32 x half> %2 +} + +define half @stack_fold_subsh(half %a0, half %a1) { + ;CHECK-LABEL: stack_fold_subsh + ;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 2-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fsub half %a0, %a1 + ret half %2 +} + +define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_subsh_int + ;CHECK: vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = extractelement <8 x half> %a0, i32 0 + %3 = extractelement <8 x half> %a1, i32 0 + %4 = fsub half %2, %3 + %5 = insertelement <8 x half> %a0, half %4, i32 0 + ret <8 x half> %5 +} + +attributes #0 = { "unsafe-fp-math"="false" } +attributes #1 = { "unsafe-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll new file mode 100644 index 
0000000000000..bd9706839943a --- /dev/null +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll @@ -0,0 +1,148 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512fp16 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. + +define <8 x half> @stack_fold_addph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_addph + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_addph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_addph_ymm + ;CHECK: vaddph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fadd <16 x half> %a0, %a1 + ret <16 x half> %2 +} + +define i8 @stack_fold_cmpph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_cmpph + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm 
sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half> %a0, <8 x half> %a1, i32 0, <8 x i1> ) + %2 = bitcast <8 x i1> %res to i8 + ret i8 %2 +} +declare <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half>, <8 x half>, i32, <8 x i1>) + +define i16 @stack_fold_cmpph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_cmpph_ymm + ;CHECK: vcmpeqph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %res = call <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half> %a0, <16 x half> %a1, i32 0, <16 x i1> ) + %2 = bitcast <16 x i1> %res to i16 + ret i16 %2 +} +declare <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half>, <16 x half>, i32, <16 x i1>) + +define <8 x half> @stack_fold_divph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_divph + ;CHECK: vdivph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fdiv <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_divph_ymm(<16 
x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_divph_ymm + ;CHECK: vdivph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fdiv <16 x half> %a0, %a1 + ret <16 x half> %2 +} + +define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone + +define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_commutable + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} + +define 
<16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_maxph_ymm + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone + +define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_maxph_ymm_commutable + ;CHECK: vmaxph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} + +define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 
= call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} +declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone + +define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_commutable + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1) + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 { + ;CHECK-LABEL: stack_fold_minph_ymm + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} +declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone + +define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 { + ;CHECK-LABEL: stack_fold_minph_ymm_commutable + ;CHECK: vminph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1) + ret <16 x half> %2 +} + +define <8 x half> @stack_fold_mulph(<8 x half> %a0, <8 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulph + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <8 x half> %a0, %a1 + ret <8 x half> %2 +} + +define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) { + ;CHECK-LABEL: stack_fold_mulph_ymm + ;CHECK: vmulph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = fmul <16 x half> %a0, %a1 + ret <16 x half> %2 +} + +attributes #0 = { "unsafe-fp-math"="false" } +attributes #1 = { "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll new file mode 100644 index 0000000000000..7657b769fa1b7 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK + +declare <8 x half> @llvm.experimental.constrained.fadd.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x half> @llvm.experimental.constrained.fdiv.v8f16(<8 x half>, <8 x half>, metadata, metadata) + +define <8 x half> @f2(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fadd.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <8 x half> @f4(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fsub.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <8 x half> @f6(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> @llvm.experimental.constrained.fmul.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +define <8 x half> @f8(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x half> 
@llvm.experimental.constrained.fdiv.v8f16(<8 x half> %a, <8 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <8 x half> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll new file mode 100644 index 0000000000000..d94003aab9daa --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-256-fp16.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s + +declare <16 x half> @llvm.experimental.constrained.fadd.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half>, <16 x half>, metadata, metadata) + +define <16 x half> @f2(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fadd.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +define <16 x half> @f4(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fsub.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +define <16 x half> @f6(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vmulph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fmul.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +define <16 x half> @f8(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x half> @llvm.experimental.constrained.fdiv.v16f16(<16 x half> %a, <16 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <16 x half> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll new file mode 100644 index 0000000000000..4a5c8ca00b5f7 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-512-fp16.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s + +declare <32 x half> @llvm.experimental.constrained.fadd.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x half> @llvm.experimental.constrained.fdiv.v32f16(<32 x half>, <32 x half>, metadata, metadata) + +define <32 x half> @f2(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fadd.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +define <32 x 
half> @f4(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fsub.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +define <32 x half> @f6(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vmulph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fmul.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +define <32 x half> @f8(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vdivph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <32 x half> @llvm.experimental.constrained.fdiv.v32f16(<32 x half> %a, <32 x half> %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <32 x half> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll new file mode 100644 index 0000000000000..da21aa68e7594 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll @@ -0,0 +1,1012 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 + +define <8 x i16> @test_v8f16_oeq_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oeq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeqph 8(%ebp), %xmm2, %k1 +; 
X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oeq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ogt_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ogt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpgt_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ogt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_oge_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpge_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oge_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x 
half> %f1, <8 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_olt_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_olt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmplt_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_olt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ole_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ole_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmple_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ole_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_one_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_one_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: 
movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneq_oqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_one_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_oqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ord_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ord_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpordph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ord_q: +; X64: # %bb.0: +; X64-NEXT: vcmpordph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ueq_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ueq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeq_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ueq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_uqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw 
%xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ugt_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ugt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnle_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ugt_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uge_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnlt_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uge_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ult_q(<8 x i16> %a, <8 x i16> %b, <8 x 
half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ult_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnge_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ult_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ule_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ule_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpngt_uqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ule_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_une_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_une_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneqph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: 
retl +; +; X64-LABEL: test_v8f16_une_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneqph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uno_q(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uno_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpunordph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uno_q: +; X64: # %bb.0: +; X64-NEXT: vcmpunordph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_oeq_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oeq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeq_osph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oeq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_osph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 
x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ogt_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ogt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpgtph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ogt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_oge_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_oge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpgeph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_oge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_olt_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_olt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpltph 8(%ebp), %xmm2, %k1 +; 
X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_olt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ole_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ole_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpleph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ole_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_one_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_one_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneq_osph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_one_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_osph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> 
%f1, <8 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ord_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ord_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpord_sph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ord_s: +; X64: # %bb.0: +; X64-NEXT: vcmpord_sph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ueq_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ueq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpeq_usph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ueq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_usph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ugt_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ugt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl 
%esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnleph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ugt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uge_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_uge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpnltph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ult_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ult_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpngeph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ult_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, 
%xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_ule_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_ule_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpngtph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_ule_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %xmm2, %xmm3, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_une_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x half> %f2) #0 { +; X86-LABEL: test_v8f16_une_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpneq_usph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_une_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_usph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <8 x i16> @test_v8f16_uno_s(<8 x i16> %a, <8 x i16> %b, <8 x half> %f1, <8 x 
half> %f2) #0 { +; X86-LABEL: test_v8f16_uno_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: vcmpunord_sph 8(%ebp), %xmm2, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v8f16_uno_s: +; X64: # %bb.0: +; X64-NEXT: vcmpunord_sph %xmm3, %xmm2, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16( + <8 x half> %f1, <8 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <8 x i1> %cond, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %res +} + +define <2 x i16> @test_v2f16_oeq_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <2 x half> %f2) #0 { +; X86-LABEL: test_v2f16_oeq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vucomish 8(%ebp), %xmm2 +; X86-NEXT: setnp %al +; X86-NEXT: sete %cl +; X86-NEXT: testb %al, %cl +; X86-NEXT: setne %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm2 +; X86-NEXT: vucomish 10(%ebp), %xmm2 +; X86-NEXT: setnp %al +; X86-NEXT: sete %cl +; X86-NEXT: testb %al, %cl +; X86-NEXT: setne %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v2f16_oeq_q: +; X64: # %bb.0: +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setnp %al +; X64-NEXT: sete %cl +; X64-NEXT: testb %al, %cl +; X64-NEXT: setne %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vpsrld 
$16, %xmm3, %xmm3 +; X64-NEXT: vpsrld $16, %xmm2, %xmm2 +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setnp %al +; X64-NEXT: sete %cl +; X64-NEXT: testb %al, %cl +; X64-NEXT: setne %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f16( + <2 x half> %f1, <2 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <2 x i1> %cond, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %res +} + +define <2 x i16> @test_v2f16_ogt_q(<2 x i16> %a, <2 x i16> %b, <2 x half> %f1, <2 x half> %f2) #0 { +; X86-LABEL: test_v2f16_ogt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vcomish 8(%ebp), %xmm2 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm2 +; X86-NEXT: vcomish 10(%ebp), %xmm2 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v2f16_ogt_q: +; X64: # %bb.0: +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: vcomish %xmm3, %xmm2 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k0, %k1, %k0 +; X64-NEXT: vpsrld $16, %xmm3, %xmm3 +; X64-NEXT: vpsrld $16, %xmm2, %xmm2 +; X64-NEXT: vcomish %xmm3, %xmm2 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <2 x i1> 
@llvm.experimental.constrained.fcmps.v2f16( + <2 x half> %f1, <2 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <2 x i1> %cond, <2 x i16> %a, <2 x i16> %b + ret <2 x i16> %res +} + +define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <4 x half> %f2) #0 { +; X86-LABEL: test_v4f16_oge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vucomish 8(%ebp), %xmm2 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm3 +; X86-NEXT: vucomish 10(%ebp), %xmm3 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-5, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; X86-NEXT: vucomish 12(%ebp), %xmm3 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $5, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-9, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X86-NEXT: vucomish 14(%ebp), %xmm2 +; X86-NEXT: setae %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $4, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v4f16_oge_q: +; X64: # %bb.0: +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k0, %k1, %k0 +; X64-NEXT: vpsrld $16, %xmm3, %xmm4 +; X64-NEXT: vpsrld $16, %xmm2, %xmm5 +; X64-NEXT: vucomish %xmm4, %xmm5 +; 
X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-5, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; X64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; X64-NEXT: vucomish %xmm4, %xmm5 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $5, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-9, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vpsrlq $48, %xmm3, %xmm3 +; X64-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X64-NEXT: vucomish %xmm3, %xmm2 +; X64-NEXT: setae %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $4, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f16( + <4 x half> %f1, <4 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <4 x i1> %cond, <4 x i16> %a, <4 x i16> %b + ret <4 x i16> %res +} + +define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, <4 x half> %f2) #0 { +; X86-LABEL: test_v4f16_olt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movb $-3, %al +; X86-NEXT: kmovd %eax, %k0 +; X86-NEXT: vmovsh 8(%ebp), %xmm3 +; X86-NEXT: vcomish %xmm2, %xmm3 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k0, %k1, %k0 +; X86-NEXT: vpsrld $16, %xmm2, %xmm3 +; X86-NEXT: vmovsh 10(%ebp), %xmm4 +; X86-NEXT: vcomish %xmm3, %xmm4 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $6, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-5, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb 
%k1, %k0, %k0 +; X86-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; X86-NEXT: vmovsh 12(%ebp), %xmm4 +; X86-NEXT: vcomish %xmm3, %xmm4 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $5, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k0 +; X86-NEXT: movb $-9, %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X86-NEXT: vmovsh 14(%ebp), %xmm3 +; X86-NEXT: vcomish %xmm2, %xmm3 +; X86-NEXT: seta %al +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlb $7, %k1, %k1 +; X86-NEXT: kshiftrb $4, %k1, %k1 +; X86-NEXT: korb %k1, %k0, %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v4f16_olt_q: +; X64: # %bb.0: +; X64-NEXT: movb $-3, %al +; X64-NEXT: kmovd %eax, %k0 +; X64-NEXT: vcomish %xmm2, %xmm3 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k0, %k1, %k0 +; X64-NEXT: vpsrld $16, %xmm2, %xmm4 +; X64-NEXT: vpsrld $16, %xmm3, %xmm5 +; X64-NEXT: vcomish %xmm4, %xmm5 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $6, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-5, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; X64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm3[1,1,3,3] +; X64-NEXT: vcomish %xmm4, %xmm5 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $5, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k0 +; X64-NEXT: movb $-9, %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kandb %k1, %k0, %k0 +; X64-NEXT: vpsrlq $48, %xmm2, %xmm2 +; X64-NEXT: vpsrlq $48, %xmm3, %xmm3 +; X64-NEXT: vcomish %xmm2, %xmm3 +; X64-NEXT: seta %al +; X64-NEXT: kmovd %eax, %k1 +; X64-NEXT: kshiftlb $7, %k1, %k1 +; X64-NEXT: kshiftrb $4, %k1, %k1 +; X64-NEXT: korb %k1, %k0, %k1 +; X64-NEXT: 
vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq + %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f16( + <4 x half> %f1, <4 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <4 x i1> %cond, <4 x i16> %a, <4 x i16> %b + ret <4 x i16> %res +} + +attributes #0 = { strictfp nounwind } + +declare <2 x i1> @llvm.experimental.constrained.fcmp.v2f16(<2 x half>, <2 x half>, metadata, metadata) +declare <2 x i1> @llvm.experimental.constrained.fcmps.v2f16(<2 x half>, <2 x half>, metadata, metadata) +declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f16(<4 x half>, <4 x half>, metadata, metadata) +declare <4 x i1> @llvm.experimental.constrained.fcmps.v4f16(<4 x half>, <4 x half>, metadata, metadata) +declare <8 x i1> @llvm.experimental.constrained.fcmp.v8f16(<8 x half>, <8 x half>, metadata, metadata) +declare <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(<8 x half>, <8 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll new file mode 100644 index 0000000000000..81987dca26567 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-256-fp16.ll @@ -0,0 +1,708 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX512-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=AVX512-64 + +define <16 x i16> @test_v16f16_oeq_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oeq_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp 
+; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oeq_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ogt_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ogt_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpgt_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ogt_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmplt_oqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_oge_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oge_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpge_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oge_q: +; AVX512-64: # %bb.0: +; 
AVX512-64-NEXT: vcmple_oqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_olt_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_olt_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmplt_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_olt_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmplt_oqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ole_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ole_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmple_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ole_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmple_oqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + 
%cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_one_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_one_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneq_oqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_one_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneq_oqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ord_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ord_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpordph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ord_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpordph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ord", + metadata 
!"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ueq_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ueq_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeq_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ueq_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeq_uqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ugt_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ugt_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnle_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ugt_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnle_uqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x 
i16> @test_v16f16_uge_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uge_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnlt_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uge_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnlt_uqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ult_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ult_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnge_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ult_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnle_uqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ule_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: 
test_v16f16_ule_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpngt_uqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ule_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnlt_uqph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_une_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_une_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneqph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_une_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneqph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uno_q(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uno_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl 
$-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpunordph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uno_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpunordph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_oeq_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oeq_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeq_osph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oeq_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeq_osph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ogt_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ogt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpgtph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, 
%ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ogt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpltph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_oge_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_oge_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpgeph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_oge_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpleph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_olt_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_olt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpltph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: 
test_v16f16_olt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpltph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ole_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ole_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpleph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ole_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpleph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_one_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_one_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneq_osph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_one_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneq_osph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, 
%ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ord_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ord_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpord_sph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ord_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpord_sph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ueq_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ueq_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpeq_usph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ueq_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpeq_usph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, 
<16 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ugt_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ugt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnleph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ugt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnleph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uge_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uge_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpnltph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uge_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnltph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret 
<16 x i16> %res +} + +define <16 x i16> @test_v16f16_ult_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ult_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpngeph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ult_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnleph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_ule_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_ule_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpngtph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_ule_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpnltph %ymm2, %ymm3, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_une_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; 
AVX512-32-LABEL: test_v16f16_une_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpneq_usph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_une_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpneq_usph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +define <16 x i16> @test_v16f16_uno_s(<16 x i16> %a, <16 x i16> %b, <16 x half> %f1, <16 x half> %f2) #0 { +; AVX512-32-LABEL: test_v16f16_uno_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-32, %esp +; AVX512-32-NEXT: subl $32, %esp +; AVX512-32-NEXT: vcmpunord_sph 8(%ebp), %ymm2, %k1 +; AVX512-32-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v16f16_uno_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vcmpunord_sph %ymm3, %ymm2, %k1 +; AVX512-64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16( + <16 x half> %f1, <16 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <16 x i1> %cond, <16 x i16> %a, <16 x i16> %b + ret <16 x i16> %res +} + +attributes #0 = { strictfp nounwind } + +declare <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half>, <16 x half>, metadata, metadata) +declare <16 x i1> @llvm.experimental.constrained.fcmps.v16f16(<16 x half>, <16 x 
half>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll new file mode 100644 index 0000000000000..bfeb41e9cf94e --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-512-fp16.ll @@ -0,0 +1,708 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64 + +define <32 x i16> @test_v32f16_oeq_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oeq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oeq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ogt_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ogt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpgt_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ogt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: 
retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_oge_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpge_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oge_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_olt_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_olt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmplt_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_olt_q: +; X64: # %bb.0: +; X64-NEXT: vcmplt_oqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ole_q(<32 x i16> %a, <32 x i16> %b, <32 
x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ole_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmple_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ole_q: +; X64: # %bb.0: +; X64-NEXT: vcmple_oqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_one_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_one_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneq_oqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_one_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_oqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ord_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ord_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpordph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; 
X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ord_q: +; X64: # %bb.0: +; X64-NEXT: vcmpordph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ueq_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ueq_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeq_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ueq_q: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_uqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ugt_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ugt_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnle_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ugt_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata 
!"ugt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uge_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uge_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnlt_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uge_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ult_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ult_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnge_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ult_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnle_uqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ule_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ule_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; 
X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpngt_uqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ule_q: +; X64: # %bb.0: +; X64-NEXT: vcmpnlt_uqph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_une_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_une_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneqph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_une_q: +; X64: # %bb.0: +; X64-NEXT: vcmpneqph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uno_q(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uno_q: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpunordph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uno_q: +; X64: # %bb.0: +; X64-NEXT: 
vcmpunordph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uno", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_oeq_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oeq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeq_osph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oeq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_osph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oeq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ogt_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ogt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpgtph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ogt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 
x i16> %res +} + +define <32 x i16> @test_v32f16_oge_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_oge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpgeph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_oge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"oge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_olt_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_olt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpltph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_olt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpltph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"olt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ole_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ole_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpleph 8(%ebp), %zmm2, %k1 +; 
X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ole_s: +; X64: # %bb.0: +; X64-NEXT: vcmpleph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ole", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_one_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_one_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneq_osph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_one_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_osph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"one", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ord_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ord_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpord_sph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ord_s: +; X64: # %bb.0: +; X64-NEXT: vcmpord_sph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> 
@llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ord", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ueq_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ueq_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpeq_usph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ueq_s: +; X64: # %bb.0: +; X64-NEXT: vcmpeq_usph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ueq", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ugt_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ugt_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnleph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ugt_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ugt", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uge_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) 
#0 { +; X86-LABEL: test_v32f16_uge_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpnltph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uge_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uge", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ult_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ult_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpngeph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_ult_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnleph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ult", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_ule_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_ule_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpngtph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl 
+; +; X64-LABEL: test_v32f16_ule_s: +; X64: # %bb.0: +; X64-NEXT: vcmpnltph %zmm2, %zmm3, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"ule", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_une_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_une_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpneq_usph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_une_s: +; X64: # %bb.0: +; X64-NEXT: vcmpneq_usph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"une", + metadata !"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +define <32 x i16> @test_v32f16_uno_s(<32 x i16> %a, <32 x i16> %b, <32 x half> %f1, <32 x half> %f2) #0 { +; X86-LABEL: test_v32f16_uno_s: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-64, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: vcmpunord_sph 8(%ebp), %zmm2, %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: test_v32f16_uno_s: +; X64: # %bb.0: +; X64-NEXT: vcmpunord_sph %zmm3, %zmm2, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq + %cond = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16( + <32 x half> %f1, <32 x half> %f2, metadata !"uno", + metadata 
!"fpexcept.strict") #0 + %res = select <32 x i1> %cond, <32 x i16> %a, <32 x i16> %b + ret <32 x i16> %res +} + +attributes #0 = { strictfp nounwind } + +declare <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half>, <32 x half>, metadata, metadata) +declare <32 x i1> @llvm.experimental.constrained.fcmps.v32f16(<32 x half>, <32 x half>, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll index d576fc074775e..b1d30090cc6d8 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -3,8 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512FP16 ; ; vXf32 @@ -416,21 +417,29 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v2f16: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; 
AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: cmoval %edi, %esi -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v2f16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzwl %si, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl %di, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 +; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: cmoval %edi, %esi +; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: retq +; +; AVX512FP16-LABEL: test_v2f16: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512FP16-NEXT: vcmpltph %xmm0, %xmm1, %k1 +; AVX512FP16-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; AVX512FP16-NEXT: vmovaps %xmm1, %xmm0 +; AVX512FP16-NEXT: retq %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0) ret half %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll index 0e96da412ef1d..50c805d37ddb7 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -3,8 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | 
FileCheck %s --check-prefixes=ALL,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512FP16 ; ; vXf32 @@ -415,21 +416,29 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX-NEXT: popq %rbp ; AVX-NEXT: retq ; -; AVX512-LABEL: test_v2f16: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: cmovbl %edi, %esi -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v2f16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movzwl %si, %eax +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl %di, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vucomiss %xmm0, %xmm1 +; AVX512BW-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: cmovbl %edi, %esi +; AVX512BW-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; AVX512BW-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512BW-NEXT: retq +; +; AVX512FP16-LABEL: test_v2f16: +; AVX512FP16: # %bb.0: +; AVX512FP16-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512FP16-NEXT: vcmpltph %xmm1, %xmm0, %k1 +; AVX512FP16-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1} +; AVX512FP16-NEXT: vmovaps %xmm1, %xmm0 +; AVX512FP16-NEXT: retq %1 = call nnan half @llvm.vector.reduce.fmin.v2f16(<2 
x half> %a0) ret half %1 } diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt index 39aa2e92d9f4f..ea8db82f89a2b 100644 --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -76,3 +76,387 @@ # ATT: vmovw %xmm30, -256(%rdx) # INTEL: vmovw word ptr [rdx - 256], xmm30 0x62,0x65,0x7d,0x08,0x7e,0x72,0x80 + +# ATT: vaddph %zmm28, %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x58,0xf4 + +# ATT: vaddph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x58,0xf4 + +# ATT: vaddph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vaddph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x58,0x31 + +# ATT: vaddph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vaddph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x58,0x71,0x7f + +# ATT: vaddph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vaddph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x58,0x72,0x80 + +# ATT: vaddsh %xmm28, %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x58,0xf4 + +# ATT: vaddsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x58,0xf4 + +# ATT: vaddsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vaddsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vaddsh (%r9), %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x58,0x31 + +# ATT: vaddsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vaddsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x58,0x71,0x7f + +# ATT: vaddsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: 
vaddsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x58,0x72,0x80 + +# ATT: vcmpeqph %zmm28, %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, zmm28, 0 +0x62,0x93,0x14,0x40,0xc2,0xec,0x00 + +# ATT: vcmpleph {sae}, %zmm28, %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, zmm28, {sae}, 2 +0x62,0x93,0x14,0x10,0xc2,0xec,0x02 + +# ATT: vcmpneqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456], 4 +0x62,0xb3,0x14,0x47,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x04 + +# ATT: vcmpnleph (%r9){1to32}, %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, word ptr [r9]{1to32}, 6 +0x62,0xd3,0x14,0x50,0xc2,0x29,0x06 + +# ATT: vcmpeq_uqph 8128(%rcx), %zmm29, %k5 +# INTEL: vcmpph k5, zmm29, zmmword ptr [rcx + 8128], 8 +0x62,0xf3,0x14,0x40,0xc2,0x69,0x7f,0x08 + +# ATT: vcmpngtph -256(%rdx){1to32}, %zmm29, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, zmm29, word ptr [rdx - 256]{1to32}, 10 +0x62,0xf3,0x14,0x57,0xc2,0x6a,0x80,0x0a + +# ATT: vcmpneq_oqsh %xmm28, %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, xmm28, 12 +0x62,0x93,0x16,0x00,0xc2,0xec,0x0c + +# ATT: vcmpgtsh {sae}, %xmm28, %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, xmm28, {sae}, 14 +0x62,0x93,0x16,0x10,0xc2,0xec,0x0e + +# ATT: vcmpeq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7} +# INTEL: vcmpsh k5 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 16 +0x62,0xb3,0x16,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x10 + +# ATT: vcmple_oqsh (%r9), %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, word ptr [r9], 18 +0x62,0xd3,0x16,0x00,0xc2,0x29,0x12 + +# ATT: vcmpneq_ussh 254(%rcx), %xmm29, %k5 +# INTEL: vcmpsh k5, xmm29, word ptr [rcx + 254], 20 +0x62,0xf3,0x16,0x00,0xc2,0x69,0x7f,0x14 + +# ATT: vcmpnle_uqsh -256(%rdx), %xmm29, %k5 {%k7} +# INTEL: vcmpsh k5 {k7}, xmm29, word ptr [rdx - 256], 22 +0x62,0xf3,0x16,0x07,0xc2,0x6a,0x80,0x16 + +# ATT: vcomish %xmm29, %xmm30 +# INTEL: vcomish xmm30, xmm29 +0x62,0x05,0x7c,0x08,0x2f,0xf5 + +# ATT: vcomish {sae}, %xmm29, %xmm30 +# INTEL: vcomish xmm30, xmm29, {sae} 
+0x62,0x05,0x7c,0x18,0x2f,0xf5 + +# ATT: vcomish 268435456(%rbp,%r14,8), %xmm30 +# INTEL: vcomish xmm30, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vcomish (%r9), %xmm30 +# INTEL: vcomish xmm30, word ptr [r9] +0x62,0x45,0x7c,0x08,0x2f,0x31 + +# ATT: vcomish 254(%rcx), %xmm30 +# INTEL: vcomish xmm30, word ptr [rcx + 254] +0x62,0x65,0x7c,0x08,0x2f,0x71,0x7f + +# ATT: vcomish -256(%rdx), %xmm30 +# INTEL: vcomish xmm30, word ptr [rdx - 256] +0x62,0x65,0x7c,0x08,0x2f,0x72,0x80 + +# ATT: vdivph %zmm28, %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5e,0xf4 + +# ATT: vdivph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x5e,0xf4 + +# ATT: vdivph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vdivph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5e,0x31 + +# ATT: vdivph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vdivph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5e,0x71,0x7f + +# ATT: vdivph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vdivph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5e,0x72,0x80 + +# ATT: vdivsh %xmm28, %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5e,0xf4 + +# ATT: vdivsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x5e,0xf4 + +# ATT: vdivsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vdivsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vdivsh (%r9), %xmm29, %xmm30 +# INTEL: vdivsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5e,0x31 + +# ATT: vdivsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vdivsh xmm30, 
xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5e,0x71,0x7f + +# ATT: vdivsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vdivsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5e,0x72,0x80 + +# ATT: vmaxph %zmm28, %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5f,0xf4 + +# ATT: vmaxph {sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, zmm28, {sae} +0x62,0x05,0x14,0x10,0x5f,0xf4 + +# ATT: vmaxph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vmaxph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5f,0x31 + +# ATT: vmaxph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vmaxph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5f,0x71,0x7f + +# ATT: vmaxph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vmaxph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5f,0x72,0x80 + +# ATT: vmaxsh %xmm28, %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5f,0xf4 + +# ATT: vmaxsh {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, xmm28, {sae} +0x62,0x05,0x16,0x10,0x5f,0xf4 + +# ATT: vmaxsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vmaxsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmaxsh (%r9), %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5f,0x31 + +# ATT: vmaxsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vmaxsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5f,0x71,0x7f + +# ATT: vmaxsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vmaxsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5f,0x72,0x80 + +# ATT: vminph %zmm28, %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5d,0xf4 + +# 
ATT: vminph {sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, zmm28, {sae} +0x62,0x05,0x14,0x10,0x5d,0xf4 + +# ATT: vminph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vminph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5d,0x31 + +# ATT: vminph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vminph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5d,0x71,0x7f + +# ATT: vminph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vminph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5d,0x72,0x80 + +# ATT: vminsh %xmm28, %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5d,0xf4 + +# ATT: vminsh {sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, xmm28, {sae} +0x62,0x05,0x16,0x10,0x5d,0xf4 + +# ATT: vminsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vminsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vminsh (%r9), %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5d,0x31 + +# ATT: vminsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vminsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5d,0x71,0x7f + +# ATT: vminsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vminsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5d,0x72,0x80 + +# ATT: vmulph %zmm28, %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x59,0xf4 + +# ATT: vmulph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x59,0xf4 + +# ATT: vmulph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vmulph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulph 
(%r9){1to32}, %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x59,0x31 + +# ATT: vmulph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vmulph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x59,0x71,0x7f + +# ATT: vmulph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: vmulph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x59,0x72,0x80 + +# ATT: vmulsh %xmm28, %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x59,0xf4 + +# ATT: vmulsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x59,0xf4 + +# ATT: vmulsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vmulsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmulsh (%r9), %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x59,0x31 + +# ATT: vmulsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vmulsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x59,0x71,0x7f + +# ATT: vmulsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vmulsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x59,0x72,0x80 + +# ATT: vsubph %zmm28, %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, zmm28 +0x62,0x05,0x14,0x40,0x5c,0xf4 + +# ATT: vsubph {rn-sae}, %zmm28, %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, zmm28, {rn-sae} +0x62,0x05,0x14,0x10,0x5c,0xf4 + +# ATT: vsubph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +# INTEL: vsubph zmm30 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x14,0x47,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubph (%r9){1to32}, %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, word ptr [r9]{1to32} +0x62,0x45,0x14,0x50,0x5c,0x31 + +# ATT: vsubph 8128(%rcx), %zmm29, %zmm30 +# INTEL: vsubph zmm30, zmm29, zmmword ptr [rcx + 8128] +0x62,0x65,0x14,0x40,0x5c,0x71,0x7f + +# ATT: vsubph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +# INTEL: 
vsubph zmm30 {k7} {z}, zmm29, word ptr [rdx - 256]{1to32} +0x62,0x65,0x14,0xd7,0x5c,0x72,0x80 + +# ATT: vsubsh %xmm28, %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x5c,0xf4 + +# ATT: vsubsh {rn-sae}, %xmm28, %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, xmm28, {rn-sae} +0x62,0x05,0x16,0x10,0x5c,0xf4 + +# ATT: vsubsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +# INTEL: vsubsh xmm30 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x16,0x07,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vsubsh (%r9), %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, word ptr [r9] +0x62,0x45,0x16,0x00,0x5c,0x31 + +# ATT: vsubsh 254(%rcx), %xmm29, %xmm30 +# INTEL: vsubsh xmm30, xmm29, word ptr [rcx + 254] +0x62,0x65,0x16,0x00,0x5c,0x71,0x7f + +# ATT: vsubsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +# INTEL: vsubsh xmm30 {k7} {z}, xmm29, word ptr [rdx - 256] +0x62,0x65,0x16,0x87,0x5c,0x72,0x80 + +# ATT: vucomish %xmm29, %xmm30 +# INTEL: vucomish xmm30, xmm29 +0x62,0x05,0x7c,0x08,0x2e,0xf5 + +# ATT: vucomish {sae}, %xmm29, %xmm30 +# INTEL: vucomish xmm30, xmm29, {sae} +0x62,0x05,0x7c,0x18,0x2e,0xf5 + +# ATT: vucomish 268435456(%rbp,%r14,8), %xmm30 +# INTEL: vucomish xmm30, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7c,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vucomish (%r9), %xmm30 +# INTEL: vucomish xmm30, word ptr [r9] +0x62,0x45,0x7c,0x08,0x2e,0x31 + +# ATT: vucomish 254(%rcx), %xmm30 +# INTEL: vucomish xmm30, word ptr [rcx + 254] +0x62,0x65,0x7c,0x08,0x2e,0x71,0x7f + +# ATT: vucomish -256(%rdx), %xmm30 +# INTEL: vucomish xmm30, word ptr [rdx - 256] +0x62,0x65,0x7c,0x08,0x2e,0x72,0x80 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt new file mode 100644 index 0000000000000..5f695c0bd3cef --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -0,0 +1,282 @@ +# RUN: llvm-mc --disassemble %s -triple=i686 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble 
%s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vaddph %ymm4, %ymm5, %ymm6 +# INTEL: vaddph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x58,0xf4 + +# ATT: vaddph %xmm4, %xmm5, %xmm6 +# INTEL: vaddph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x58,0xf4 + +# ATT: vaddph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vaddph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vaddph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x58,0x31 + +# ATT: vaddph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vaddph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x58,0x71,0x7f + +# ATT: vaddph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vaddph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x58,0x72,0x80 + +# ATT: vaddph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vaddph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x0f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vaddph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vaddph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x58,0x31 + +# ATT: vaddph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vaddph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x58,0x71,0x7f + +# ATT: vaddph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vaddph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x58,0x72,0x80 + +# ATT: vcmpltph %ymm4, %ymm5, %k5 +# INTEL: vcmpph k5, ymm5, ymm4, 1 +0x62,0xf3,0x54,0x28,0xc2,0xec,0x01 + +# ATT: vcmpunordph %xmm4, %xmm5, %k5 +# INTEL: vcmpph k5, xmm5, xmm4, 3 +0x62,0xf3,0x54,0x08,0xc2,0xec,0x03 + +# ATT: vcmpnltph 268435456(%esp,%esi,8), %xmm5, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456], 5 +0x62,0xf3,0x54,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x05 + +# ATT: vcmpordph (%ecx){1to8}, %xmm5, %k5 +# INTEL: vcmpph k5, 
xmm5, word ptr [ecx]{1to8}, 7 +0x62,0xf3,0x54,0x18,0xc2,0x29,0x07 + +# ATT: vcmpngeph 2032(%ecx), %xmm5, %k5 +# INTEL: vcmpph k5, xmm5, xmmword ptr [ecx + 2032], 9 +0x62,0xf3,0x54,0x08,0xc2,0x69,0x7f,0x09 + +# ATT: vcmpfalseph -256(%edx){1to8}, %xmm5, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, xmm5, word ptr [edx - 256]{1to8}, 11 +0x62,0xf3,0x54,0x1f,0xc2,0x6a,0x80,0x0b + +# ATT: vcmpgeph 268435456(%esp,%esi,8), %ymm5, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456], 13 +0x62,0xf3,0x54,0x2f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x0d + +# ATT: vcmptrueph (%ecx){1to16}, %ymm5, %k5 +# INTEL: vcmpph k5, ymm5, word ptr [ecx]{1to16}, 15 +0x62,0xf3,0x54,0x38,0xc2,0x29,0x0f + +# ATT: vcmplt_oqph 4064(%ecx), %ymm5, %k5 +# INTEL: vcmpph k5, ymm5, ymmword ptr [ecx + 4064], 17 +0x62,0xf3,0x54,0x28,0xc2,0x69,0x7f,0x11 + +# ATT: vcmpunord_sph -256(%edx){1to16}, %ymm5, %k5 {%k7} +# INTEL: vcmpph k5 {k7}, ymm5, word ptr [edx - 256]{1to16}, 19 +0x62,0xf3,0x54,0x3f,0xc2,0x6a,0x80,0x13 + +# ATT: vdivph %ymm4, %ymm5, %ymm6 +# INTEL: vdivph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x5e,0xf4 + +# ATT: vdivph %xmm4, %xmm5, %xmm6 +# INTEL: vdivph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x5e,0xf4 + +# ATT: vdivph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vdivph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vdivph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x5e,0x31 + +# ATT: vdivph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vdivph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x5e,0x71,0x7f + +# ATT: vdivph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vdivph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x5e,0x72,0x80 + +# ATT: vdivph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vdivph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] 
+0x62,0xf5,0x54,0x0f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vdivph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vdivph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x5e,0x31 + +# ATT: vdivph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vdivph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x5e,0x71,0x7f + +# ATT: vdivph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vdivph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x5e,0x72,0x80 + +# ATT: vmaxph %ymm4, %ymm5, %ymm6 +# INTEL: vmaxph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x5f,0xf4 + +# ATT: vmaxph %xmm4, %xmm5, %xmm6 +# INTEL: vmaxph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x5f,0xf4 + +# ATT: vmaxph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vmaxph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vmaxph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x5f,0x31 + +# ATT: vmaxph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vmaxph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x5f,0x71,0x7f + +# ATT: vmaxph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vmaxph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x5f,0x72,0x80 + +# ATT: vmaxph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vmaxph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x0f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmaxph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vmaxph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x5f,0x31 + +# ATT: vmaxph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vmaxph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x5f,0x71,0x7f + +# ATT: vmaxph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vmaxph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x5f,0x72,0x80 + +# ATT: vminph %ymm4, %ymm5, %ymm6 +# INTEL: vminph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x5d,0xf4 + +# ATT: 
vminph %xmm4, %xmm5, %xmm6 +# INTEL: vminph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x5d,0xf4 + +# ATT: vminph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vminph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vminph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x5d,0x31 + +# ATT: vminph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vminph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x5d,0x71,0x7f + +# ATT: vminph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vminph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x5d,0x72,0x80 + +# ATT: vminph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vminph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x0f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vminph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vminph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x5d,0x31 + +# ATT: vminph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vminph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x5d,0x71,0x7f + +# ATT: vminph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vminph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x5d,0x72,0x80 + +# ATT: vmulph %ymm4, %ymm5, %ymm6 +# INTEL: vmulph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x59,0xf4 + +# ATT: vmulph %xmm4, %xmm5, %xmm6 +# INTEL: vmulph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x59,0xf4 + +# ATT: vmulph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vmulph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vmulph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x59,0x31 + +# ATT: vmulph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vmulph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x59,0x71,0x7f + +# ATT: vmulph -256(%edx){1to16}, %ymm5, 
%ymm6 {%k7} {z} +# INTEL: vmulph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x59,0x72,0x80 + +# ATT: vmulph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vmulph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x0f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vmulph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vmulph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x59,0x31 + +# ATT: vmulph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vmulph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x59,0x71,0x7f + +# ATT: vmulph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vmulph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x59,0x72,0x80 + +# ATT: vsubph %ymm4, %ymm5, %ymm6 +# INTEL: vsubph ymm6, ymm5, ymm4 +0x62,0xf5,0x54,0x28,0x5c,0xf4 + +# ATT: vsubph %xmm4, %xmm5, %xmm6 +# INTEL: vsubph xmm6, xmm5, xmm4 +0x62,0xf5,0x54,0x08,0x5c,0xf4 + +# ATT: vsubph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +# INTEL: vsubph ymm6 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x2f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubph (%ecx){1to16}, %ymm5, %ymm6 +# INTEL: vsubph ymm6, ymm5, word ptr [ecx]{1to16} +0x62,0xf5,0x54,0x38,0x5c,0x31 + +# ATT: vsubph 4064(%ecx), %ymm5, %ymm6 +# INTEL: vsubph ymm6, ymm5, ymmword ptr [ecx + 4064] +0x62,0xf5,0x54,0x28,0x5c,0x71,0x7f + +# ATT: vsubph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +# INTEL: vsubph ymm6 {k7} {z}, ymm5, word ptr [edx - 256]{1to16} +0x62,0xf5,0x54,0xbf,0x5c,0x72,0x80 + +# ATT: vsubph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +# INTEL: vsubph xmm6 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] +0x62,0xf5,0x54,0x0f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10 + +# ATT: vsubph (%ecx){1to8}, %xmm5, %xmm6 +# INTEL: vsubph xmm6, xmm5, word ptr [ecx]{1to8} +0x62,0xf5,0x54,0x18,0x5c,0x31 + +# ATT: vsubph 2032(%ecx), %xmm5, %xmm6 +# INTEL: vsubph xmm6, xmm5, xmmword ptr [ecx + 2032] +0x62,0xf5,0x54,0x08,0x5c,0x71,0x7f + +# ATT: vsubph 
-256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +# INTEL: vsubph xmm6 {k7} {z}, xmm5, word ptr [edx - 256]{1to8} +0x62,0xf5,0x54,0x9f,0x5c,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s index 6118339d49d48..c45d0956faa1c 100644 --- a/llvm/test/MC/X86/avx512fp16.s +++ b/llvm/test/MC/X86/avx512fp16.s @@ -75,3 +75,387 @@ // CHECK: vmovw %xmm30, -256(%rdx) // CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7e,0x72,0x80] vmovw %xmm30, -256(%rdx) + +// CHECK: vaddph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x58,0xf4] + vaddph %zmm28, %zmm29, %zmm30 + +// CHECK: vaddph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x58,0xf4] + vaddph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vaddph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vaddph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x58,0x31] + vaddph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vaddph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x58,0x71,0x7f] + vaddph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vaddph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x58,0x72,0x80] + vaddph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vaddsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x58,0xf4] + vaddsh %xmm28, %xmm29, %xmm30 + +// CHECK: vaddsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x58,0xf4] + vaddsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vaddsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vaddsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x58,0x31] + vaddsh (%r9), %xmm29, %xmm30 + 
+// CHECK: vaddsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x58,0x71,0x7f] + vaddsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vaddsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x58,0x72,0x80] + vaddsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vcmpneq_usph %zmm28, %zmm29, %k5 +// CHECK: encoding: [0x62,0x93,0x14,0x40,0xc2,0xec,0x14] + vcmpneq_usph %zmm28, %zmm29, %k5 + +// CHECK: vcmpnlt_uqph {sae}, %zmm28, %zmm29, %k5 +// CHECK: encoding: [0x62,0x93,0x14,0x10,0xc2,0xec,0x15] + vcmpnlt_uqph {sae}, %zmm28, %zmm29, %k5 + +// CHECK: vcmpnle_uqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x14,0x47,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x16] + vcmpnle_uqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7} + +// CHECK: vcmpord_sph (%r9){1to32}, %zmm29, %k5 +// CHECK: encoding: [0x62,0xd3,0x14,0x50,0xc2,0x29,0x17] + vcmpord_sph (%r9){1to32}, %zmm29, %k5 + +// CHECK: vcmpeq_usph 8128(%rcx), %zmm29, %k5 +// CHECK: encoding: [0x62,0xf3,0x14,0x40,0xc2,0x69,0x7f,0x18] + vcmpeq_usph 8128(%rcx), %zmm29, %k5 + +// CHECK: vcmpnge_uqph -256(%rdx){1to32}, %zmm29, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x14,0x57,0xc2,0x6a,0x80,0x19] + vcmpnge_uqph -256(%rdx){1to32}, %zmm29, %k5 {%k7} + +// CHECK: vcmpngt_uqsh %xmm28, %xmm29, %k5 +// CHECK: encoding: [0x62,0x93,0x16,0x00,0xc2,0xec,0x1a] + vcmpngt_uqsh %xmm28, %xmm29, %k5 + +// CHECK: vcmpfalse_ossh {sae}, %xmm28, %xmm29, %k5 +// CHECK: encoding: [0x62,0x93,0x16,0x10,0xc2,0xec,0x1b] + vcmpfalse_ossh {sae}, %xmm28, %xmm29, %k5 + +// CHECK: vcmpneq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7} +// CHECK: encoding: [0x62,0xb3,0x16,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x1c] + vcmpneq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7} + +// CHECK: vcmpge_oqsh (%r9), %xmm29, %k5 +// CHECK: encoding: [0x62,0xd3,0x16,0x00,0xc2,0x29,0x1d] + vcmpge_oqsh (%r9), %xmm29, %k5 + +// CHECK: vcmpgt_oqsh 254(%rcx), %xmm29, %k5 +// CHECK: encoding: 
[0x62,0xf3,0x16,0x00,0xc2,0x69,0x7f,0x1e] + vcmpgt_oqsh 254(%rcx), %xmm29, %k5 + +// CHECK: vcmptrue_ussh -256(%rdx), %xmm29, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x16,0x07,0xc2,0x6a,0x80,0x1f] + vcmptrue_ussh -256(%rdx), %xmm29, %k5 {%k7} + +// CHECK: vcomish %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x2f,0xf5] + vcomish %xmm29, %xmm30 + +// CHECK: vcomish {sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x2f,0xf5] + vcomish {sae}, %xmm29, %xmm30 + +// CHECK: vcomish 268435456(%rbp,%r14,8), %xmm30 +// CHECK: encoding: [0x62,0x25,0x7c,0x08,0x2f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vcomish 268435456(%rbp,%r14,8), %xmm30 + +// CHECK: vcomish (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x08,0x2f,0x31] + vcomish (%r9), %xmm30 + +// CHECK: vcomish 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2f,0x71,0x7f] + vcomish 254(%rcx), %xmm30 + +// CHECK: vcomish -256(%rdx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2f,0x72,0x80] + vcomish -256(%rdx), %xmm30 + +// CHECK: vdivph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5e,0xf4] + vdivph %zmm28, %zmm29, %zmm30 + +// CHECK: vdivph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5e,0xf4] + vdivph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vdivph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vdivph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5e,0x31] + vdivph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vdivph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5e,0x71,0x7f] + vdivph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vdivph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5e,0x72,0x80] + vdivph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vdivsh %xmm28, %xmm29, %xmm30 +// 
CHECK: encoding: [0x62,0x05,0x16,0x00,0x5e,0xf4] + vdivsh %xmm28, %xmm29, %xmm30 + +// CHECK: vdivsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5e,0xf4] + vdivsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vdivsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vdivsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5e,0x31] + vdivsh (%r9), %xmm29, %xmm30 + +// CHECK: vdivsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5e,0x71,0x7f] + vdivsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vdivsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5e,0x72,0x80] + vdivsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vmaxph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5f,0xf4] + vmaxph %zmm28, %zmm29, %zmm30 + +// CHECK: vmaxph {sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5f,0xf4] + vmaxph {sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vmaxph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vmaxph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5f,0x31] + vmaxph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vmaxph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5f,0x71,0x7f] + vmaxph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vmaxph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5f,0x72,0x80] + vmaxph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vmaxsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5f,0xf4] + vmaxsh %xmm28, %xmm29, %xmm30 + +// CHECK: vmaxsh {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: 
[0x62,0x05,0x16,0x10,0x5f,0xf4] + vmaxsh {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vmaxsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vmaxsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5f,0x31] + vmaxsh (%r9), %xmm29, %xmm30 + +// CHECK: vmaxsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5f,0x71,0x7f] + vmaxsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vmaxsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5f,0x72,0x80] + vmaxsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vminph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5d,0xf4] + vminph %zmm28, %zmm29, %zmm30 + +// CHECK: vminph {sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5d,0xf4] + vminph {sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vminph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vminph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5d,0x31] + vminph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vminph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5d,0x71,0x7f] + vminph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vminph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5d,0x72,0x80] + vminph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vminsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5d,0xf4] + vminsh %xmm28, %xmm29, %xmm30 + +// CHECK: vminsh {sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5d,0xf4] + vminsh {sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vminsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: 
[0x62,0x25,0x16,0x07,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vminsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5d,0x31] + vminsh (%r9), %xmm29, %xmm30 + +// CHECK: vminsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x5d,0x71,0x7f] + vminsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vminsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5d,0x72,0x80] + vminsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vmulph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x59,0xf4] + vmulph %zmm28, %zmm29, %zmm30 + +// CHECK: vmulph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x59,0xf4] + vmulph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vmulph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vmulph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x59,0x31] + vmulph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vmulph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x59,0x71,0x7f] + vmulph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vmulph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x59,0x72,0x80] + vmulph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vmulsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x59,0xf4] + vmulsh %xmm28, %xmm29, %xmm30 + +// CHECK: vmulsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x59,0xf4] + vmulsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vmulsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vmulsh (%r9), %xmm29, %xmm30 +// CHECK: 
encoding: [0x62,0x45,0x16,0x00,0x59,0x31] + vmulsh (%r9), %xmm29, %xmm30 + +// CHECK: vmulsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x65,0x16,0x00,0x59,0x71,0x7f] + vmulsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vmulsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x59,0x72,0x80] + vmulsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vsubph %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x40,0x5c,0xf4] + vsubph %zmm28, %zmm29, %zmm30 + +// CHECK: vsubph {rn-sae}, %zmm28, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x05,0x14,0x10,0x5c,0xf4] + vsubph {rn-sae}, %zmm28, %zmm29, %zmm30 + +// CHECK: vsubph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x14,0x47,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubph 268435456(%rbp,%r14,8), %zmm29, %zmm30 {%k7} + +// CHECK: vsubph (%r9){1to32}, %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x45,0x14,0x50,0x5c,0x31] + vsubph (%r9){1to32}, %zmm29, %zmm30 + +// CHECK: vsubph 8128(%rcx), %zmm29, %zmm30 +// CHECK: encoding: [0x62,0x65,0x14,0x40,0x5c,0x71,0x7f] + vsubph 8128(%rcx), %zmm29, %zmm30 + +// CHECK: vsubph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x14,0xd7,0x5c,0x72,0x80] + vsubph -256(%rdx){1to32}, %zmm29, %zmm30 {%k7} {z} + +// CHECK: vsubsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x5c,0xf4] + vsubsh %xmm28, %xmm29, %xmm30 + +// CHECK: vsubsh {rn-sae}, %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x10,0x5c,0xf4] + vsubsh {rn-sae}, %xmm28, %xmm29, %xmm30 + +// CHECK: vsubsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x16,0x07,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubsh 268435456(%rbp,%r14,8), %xmm29, %xmm30 {%k7} + +// CHECK: vsubsh (%r9), %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x45,0x16,0x00,0x5c,0x31] + vsubsh (%r9), %xmm29, %xmm30 + +// CHECK: vsubsh 254(%rcx), %xmm29, %xmm30 +// CHECK: encoding: 
[0x62,0x65,0x16,0x00,0x5c,0x71,0x7f] + vsubsh 254(%rcx), %xmm29, %xmm30 + +// CHECK: vsubsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x16,0x87,0x5c,0x72,0x80] + vsubsh -256(%rdx), %xmm29, %xmm30 {%k7} {z} + +// CHECK: vucomish %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x08,0x2e,0xf5] + vucomish %xmm29, %xmm30 + +// CHECK: vucomish {sae}, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x7c,0x18,0x2e,0xf5] + vucomish {sae}, %xmm29, %xmm30 + +// CHECK: vucomish 268435456(%rbp,%r14,8), %xmm30 +// CHECK: encoding: [0x62,0x25,0x7c,0x08,0x2e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vucomish 268435456(%rbp,%r14,8), %xmm30 + +// CHECK: vucomish (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7c,0x08,0x2e,0x31] + vucomish (%r9), %xmm30 + +// CHECK: vucomish 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2e,0x71,0x7f] + vucomish 254(%rcx), %xmm30 + +// CHECK: vucomish -256(%rdx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7c,0x08,0x2e,0x72,0x80] + vucomish -256(%rdx), %xmm30 diff --git a/llvm/test/MC/X86/avx512fp16vl.s b/llvm/test/MC/X86/avx512fp16vl.s new file mode 100644 index 0000000000000..e0ce1b996e906 --- /dev/null +++ b/llvm/test/MC/X86/avx512fp16vl.s @@ -0,0 +1,281 @@ +// RUN: llvm-mc -triple i686-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: vaddph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x58,0xf4] + vaddph %ymm4, %ymm5, %ymm6 + +// CHECK: vaddph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x58,0xf4] + vaddph %xmm4, %xmm5, %xmm6 + +// CHECK: vaddph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vaddph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x58,0x31] + vaddph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vaddph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x58,0x71,0x7f] + vaddph 
4064(%ecx), %ymm5, %ymm6 + +// CHECK: vaddph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x58,0x72,0x80] + vaddph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vaddph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vaddph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x58,0x31] + vaddph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vaddph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x58,0x71,0x7f] + vaddph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vaddph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x58,0x72,0x80] + vaddph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vcmpeqph %ymm4, %ymm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x28,0xc2,0xec,0x00] + vcmpph $0, %ymm4, %ymm5, %k5 + +// CHECK: vcmpltph %xmm4, %xmm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0xc2,0xec,0x01] + vcmpph $1, %xmm4, %xmm5, %k5 + +// CHECK: vcmpleph 268435456(%esp,%esi,8), %xmm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x02] + vcmpph $2, 268435456(%esp,%esi,8), %xmm5, %k5 {%k7} + +// CHECK: vcmpunordph (%ecx){1to8}, %xmm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0xc2,0x29,0x03] + vcmpph $3, (%ecx){1to8}, %xmm5, %k5 + +// CHECK: vcmpneqph 2032(%ecx), %xmm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x08,0xc2,0x69,0x7f,0x04] + vcmpph $4, 2032(%ecx), %xmm5, %k5 + +// CHECK: vcmpnltph -256(%edx){1to8}, %xmm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x1f,0xc2,0x6a,0x80,0x05] + vcmpph $5, -256(%edx){1to8}, %xmm5, %k5 {%k7} + +// CHECK: vcmpnleph 268435456(%esp,%esi,8), %ymm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x2f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x06] + vcmpph $6, 268435456(%esp,%esi,8), %ymm5, %k5 {%k7} + +// CHECK: vcmpordph (%ecx){1to16}, %ymm5, %k5 
+// CHECK: encoding: [0x62,0xf3,0x54,0x38,0xc2,0x29,0x07] + vcmpph $7, (%ecx){1to16}, %ymm5, %k5 + +// CHECK: vcmpeq_uqph 4064(%ecx), %ymm5, %k5 +// CHECK: encoding: [0x62,0xf3,0x54,0x28,0xc2,0x69,0x7f,0x08] + vcmpph $8, 4064(%ecx), %ymm5, %k5 + +// CHECK: vcmpngeph -256(%edx){1to16}, %ymm5, %k5 {%k7} +// CHECK: encoding: [0x62,0xf3,0x54,0x3f,0xc2,0x6a,0x80,0x09] + vcmpph $9, -256(%edx){1to16}, %ymm5, %k5 {%k7} + +// CHECK: vdivph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5e,0xf4] + vdivph %ymm4, %ymm5, %ymm6 + +// CHECK: vdivph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5e,0xf4] + vdivph %xmm4, %xmm5, %xmm6 + +// CHECK: vdivph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vdivph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5e,0x31] + vdivph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vdivph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5e,0x71,0x7f] + vdivph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vdivph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5e,0x72,0x80] + vdivph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vdivph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vdivph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5e,0x31] + vdivph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vdivph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5e,0x71,0x7f] + vdivph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vdivph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5e,0x72,0x80] + vdivph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vmaxph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: 
[0x62,0xf5,0x54,0x28,0x5f,0xf4] + vmaxph %ymm4, %ymm5, %ymm6 + +// CHECK: vmaxph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5f,0xf4] + vmaxph %xmm4, %xmm5, %xmm6 + +// CHECK: vmaxph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vmaxph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5f,0x31] + vmaxph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vmaxph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5f,0x71,0x7f] + vmaxph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vmaxph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5f,0x72,0x80] + vmaxph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vmaxph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vmaxph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5f,0x31] + vmaxph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vmaxph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5f,0x71,0x7f] + vmaxph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vmaxph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5f,0x72,0x80] + vmaxph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vminph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5d,0xf4] + vminph %ymm4, %ymm5, %ymm6 + +// CHECK: vminph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5d,0xf4] + vminph %xmm4, %xmm5, %xmm6 + +// CHECK: vminph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vminph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5d,0x31] + 
vminph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vminph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5d,0x71,0x7f] + vminph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vminph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5d,0x72,0x80] + vminph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vminph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vminph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5d,0x31] + vminph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vminph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5d,0x71,0x7f] + vminph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vminph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x5d,0x72,0x80] + vminph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vmulph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x59,0xf4] + vmulph %ymm4, %ymm5, %ymm6 + +// CHECK: vmulph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x59,0xf4] + vmulph %xmm4, %xmm5, %xmm6 + +// CHECK: vmulph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vmulph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x59,0x31] + vmulph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vmulph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x59,0x71,0x7f] + vmulph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vmulph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x59,0x72,0x80] + vmulph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vmulph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: 
[0x62,0xf5,0x54,0x0f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vmulph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x59,0x31] + vmulph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vmulph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x59,0x71,0x7f] + vmulph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vmulph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0x9f,0x59,0x72,0x80] + vmulph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} + +// CHECK: vsubph %ymm4, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5c,0xf4] + vsubph %ymm4, %ymm5, %ymm6 + +// CHECK: vsubph %xmm4, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5c,0xf4] + vsubph %xmm4, %xmm5, %xmm6 + +// CHECK: vsubph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x2f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubph 268435456(%esp,%esi,8), %ymm5, %ymm6 {%k7} + +// CHECK: vsubph (%ecx){1to16}, %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x38,0x5c,0x31] + vsubph (%ecx){1to16}, %ymm5, %ymm6 + +// CHECK: vsubph 4064(%ecx), %ymm5, %ymm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x28,0x5c,0x71,0x7f] + vsubph 4064(%ecx), %ymm5, %ymm6 + +// CHECK: vsubph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} +// CHECK: encoding: [0x62,0xf5,0x54,0xbf,0x5c,0x72,0x80] + vsubph -256(%edx){1to16}, %ymm5, %ymm6 {%k7} {z} + +// CHECK: vsubph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} +// CHECK: encoding: [0x62,0xf5,0x54,0x0f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubph 268435456(%esp,%esi,8), %xmm5, %xmm6 {%k7} + +// CHECK: vsubph (%ecx){1to8}, %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5c,0x31] + vsubph (%ecx){1to8}, %xmm5, %xmm6 + +// CHECK: vsubph 2032(%ecx), %xmm5, %xmm6 +// CHECK: encoding: [0x62,0xf5,0x54,0x08,0x5c,0x71,0x7f] + vsubph 2032(%ecx), %xmm5, %xmm6 + +// CHECK: vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} +// CHECK: encoding: 
[0x62,0xf5,0x54,0x9f,0x5c,0x72,0x80] + vsubph -256(%edx){1to8}, %xmm5, %xmm6 {%k7} {z} diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s index 0f3ea81b84021..ae6ab881efdfb 100644 --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -75,3 +75,387 @@ // CHECK: vmovw word ptr [edx - 256], xmm6 // CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x72,0x80] vmovw word ptr [edx - 256], xmm6 + +// CHECK: vaddph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x58,0xf4] + vaddph zmm6, zmm5, zmm4 + +// CHECK: vaddph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x58,0xf4] + vaddph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vaddph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vaddph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x58,0x31] + vaddph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vaddph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x58,0x71,0x7f] + vaddph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vaddph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x58,0x72,0x80] + vaddph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vaddsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x58,0xf4] + vaddsh xmm6, xmm5, xmm4 + +// CHECK: vaddsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x58,0xf4] + vaddsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vaddsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x58,0xb4,0xf4,0x00,0x00,0x00,0x10] + vaddsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vaddsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: 
[0x62,0xf5,0x56,0x08,0x58,0x31] + vaddsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vaddsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x58,0x71,0x7f] + vaddsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vaddsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x58,0x72,0x80] + vaddsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vcmpph k5, zmm5, zmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0xec,0x7b] + vcmpph k5, zmm5, zmm4, 123 + +// CHECK: vcmpph k5, zmm5, zmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0xc2,0xec,0x7b] + vcmpph k5, zmm5, zmm4, {sae}, 123 + +// CHECK: vcmpph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x4f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmpph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmpph k5, zmm5, word ptr [ecx]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x58,0xc2,0x29,0x7b] + vcmpph k5, zmm5, word ptr [ecx]{1to32}, 123 + +// CHECK: vcmpph k5, zmm5, zmmword ptr [ecx + 8128], 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0x69,0x7f,0x7b] + vcmpph k5, zmm5, zmmword ptr [ecx + 8128], 123 + +// CHECK: vcmpph k5 {k7}, zmm5, word ptr [edx - 256]{1to32}, 123 +// CHECK: encoding: [0x62,0xf3,0x54,0x5f,0xc2,0x6a,0x80,0x7b] + vcmpph k5 {k7}, zmm5, word ptr [edx - 256]{1to32}, 123 + +// CHECK: vcmpsh k5, xmm5, xmm4, 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0xec,0x7b] + vcmpsh k5, xmm5, xmm4, 123 + +// CHECK: vcmpsh k5, xmm5, xmm4, {sae}, 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x18,0xc2,0xec,0x7b] + vcmpsh k5, xmm5, xmm4, {sae}, 123 + +// CHECK: vcmpsh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] + vcmpsh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 + +// CHECK: vcmpsh k5, xmm5, word ptr [ecx], 123 +// CHECK: encoding: 
[0x62,0xf3,0x56,0x08,0xc2,0x29,0x7b] + vcmpsh k5, xmm5, word ptr [ecx], 123 + +// CHECK: vcmpsh k5, xmm5, word ptr [ecx + 254], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0x69,0x7f,0x7b] + vcmpsh k5, xmm5, word ptr [ecx + 254], 123 + +// CHECK: vcmpsh k5 {k7}, xmm5, word ptr [edx - 256], 123 +// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0x6a,0x80,0x7b] + vcmpsh k5 {k7}, xmm5, word ptr [edx - 256], 123 + +// CHECK: vcomish xmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0xf5] + vcomish xmm6, xmm5 + +// CHECK: vcomish xmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x2f,0xf5] + vcomish xmm6, xmm5, {sae} + +// CHECK: vcomish xmm6, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vcomish xmm6, word ptr [esp + 8*esi + 268435456] + +// CHECK: vcomish xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0x31] + vcomish xmm6, word ptr [ecx] + +// CHECK: vcomish xmm6, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0x71,0x7f] + vcomish xmm6, word ptr [ecx + 254] + +// CHECK: vcomish xmm6, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0x72,0x80] + vcomish xmm6, word ptr [edx - 256] + +// CHECK: vdivph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5e,0xf4] + vdivph zmm6, zmm5, zmm4 + +// CHECK: vdivph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5e,0xf4] + vdivph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vdivph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vdivph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5e,0x31] + vdivph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vdivph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5e,0x71,0x7f] + vdivph zmm6, zmm5, 
zmmword ptr [ecx + 8128] + +// CHECK: vdivph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5e,0x72,0x80] + vdivph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vdivsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5e,0xf4] + vdivsh xmm6, xmm5, xmm4 + +// CHECK: vdivsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5e,0xf4] + vdivsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vdivsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vdivsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vdivsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5e,0x31] + vdivsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vdivsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5e,0x71,0x7f] + vdivsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vdivsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5e,0x72,0x80] + vdivsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vmaxph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5f,0xf4] + vmaxph zmm6, zmm5, zmm4 + +// CHECK: vmaxph zmm6, zmm5, zmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5f,0xf4] + vmaxph zmm6, zmm5, zmm4, {sae} + +// CHECK: vmaxph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5f,0x31] + vmaxph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vmaxph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5f,0x71,0x7f] + vmaxph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vmaxph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: 
[0x62,0xf5,0x54,0xdf,0x5f,0x72,0x80] + vmaxph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vmaxsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5f,0xf4] + vmaxsh xmm6, xmm5, xmm4 + +// CHECK: vmaxsh xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5f,0xf4] + vmaxsh xmm6, xmm5, xmm4, {sae} + +// CHECK: vmaxsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5f,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmaxsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmaxsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5f,0x31] + vmaxsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vmaxsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5f,0x71,0x7f] + vmaxsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vmaxsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5f,0x72,0x80] + vmaxsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vminph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5d,0xf4] + vminph zmm6, zmm5, zmm4 + +// CHECK: vminph zmm6, zmm5, zmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5d,0xf4] + vminph zmm6, zmm5, zmm4, {sae} + +// CHECK: vminph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vminph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5d,0x31] + vminph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vminph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5d,0x71,0x7f] + vminph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vminph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5d,0x72,0x80] + vminph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vminsh xmm6, 
xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5d,0xf4] + vminsh xmm6, xmm5, xmm4 + +// CHECK: vminsh xmm6, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5d,0xf4] + vminsh xmm6, xmm5, xmm4, {sae} + +// CHECK: vminsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5d,0xb4,0xf4,0x00,0x00,0x00,0x10] + vminsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vminsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5d,0x31] + vminsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vminsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5d,0x71,0x7f] + vminsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vminsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5d,0x72,0x80] + vminsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vmulph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x59,0xf4] + vmulph zmm6, zmm5, zmm4 + +// CHECK: vmulph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x59,0xf4] + vmulph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vmulph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vmulph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x59,0x31] + vmulph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vmulph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x59,0x71,0x7f] + vmulph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vmulph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x59,0x72,0x80] + vmulph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vmulsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x59,0xf4] + vmulsh xmm6, xmm5, xmm4 + +// CHECK: vmulsh xmm6, 
xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x59,0xf4] + vmulsh xmm6, xmm5, xmm4, {rn-sae} + +// CHECK: vmulsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x59,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmulsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmulsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x59,0x31] + vmulsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vmulsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x59,0x71,0x7f] + vmulsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vmulsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x59,0x72,0x80] + vmulsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vsubph zmm6, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5c,0xf4] + vsubph zmm6, zmm5, zmm4 + +// CHECK: vsubph zmm6, zmm5, zmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x54,0x18,0x5c,0xf4] + vsubph zmm6, zmm5, zmm4, {rn-sae} + +// CHECK: vsubph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x54,0x4f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubph zmm6 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] + +// CHECK: vsubph zmm6, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0x58,0x5c,0x31] + vsubph zmm6, zmm5, word ptr [ecx]{1to32} + +// CHECK: vsubph zmm6, zmm5, zmmword ptr [ecx + 8128] +// CHECK: encoding: [0x62,0xf5,0x54,0x48,0x5c,0x71,0x7f] + vsubph zmm6, zmm5, zmmword ptr [ecx + 8128] + +// CHECK: vsubph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf5,0x54,0xdf,0x5c,0x72,0x80] + vsubph zmm6 {k7} {z}, zmm5, word ptr [edx - 256]{1to32} + +// CHECK: vsubsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5c,0xf4] + vsubsh xmm6, xmm5, xmm4 + +// CHECK: vsubsh xmm6, xmm5, xmm4, {rn-sae} +// CHECK: encoding: [0x62,0xf5,0x56,0x18,0x5c,0xf4] + vsubsh xmm6, xmm5, xmm4, {rn-sae} + 
+// CHECK: vsubsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x56,0x0f,0x5c,0xb4,0xf4,0x00,0x00,0x00,0x10] + vsubsh xmm6 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] + +// CHECK: vsubsh xmm6, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5c,0x31] + vsubsh xmm6, xmm5, word ptr [ecx] + +// CHECK: vsubsh xmm6, xmm5, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x5c,0x71,0x7f] + vsubsh xmm6, xmm5, word ptr [ecx + 254] + +// CHECK: vsubsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x5c,0x72,0x80] + vsubsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] + +// CHECK: vucomish xmm6, xmm5 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0xf5] + vucomish xmm6, xmm5 + +// CHECK: vucomish xmm6, xmm5, {sae} +// CHECK: encoding: [0x62,0xf5,0x7c,0x18,0x2e,0xf5] + vucomish xmm6, xmm5, {sae} + +// CHECK: vucomish xmm6, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vucomish xmm6, word ptr [esp + 8*esi + 268435456] + +// CHECK: vucomish xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x31] + vucomish xmm6, word ptr [ecx] + +// CHECK: vucomish xmm6, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x71,0x7f] + vucomish xmm6, word ptr [ecx + 254] + +// CHECK: vucomish xmm6, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2e,0x72,0x80] + vucomish xmm6, word ptr [edx - 256] diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s new file mode 100644 index 0000000000000..161208fdb452e --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s @@ -0,0 +1,281 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vaddph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x58,0xf4] + vaddph ymm30, ymm29, 
ymm28 + +// CHECK: vaddph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x58,0xf4] + vaddph xmm30, xmm29, xmm28 + +// CHECK: vaddph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x58,0x31] + vaddph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vaddph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x58,0x71,0x7f] + vaddph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vaddph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x58,0x72,0x80] + vaddph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vaddph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x58,0xb4,0xf5,0x00,0x00,0x00,0x10] + vaddph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vaddph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x58,0x31] + vaddph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vaddph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x58,0x71,0x7f] + vaddph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vaddph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x58,0x72,0x80] + vaddph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vcmpph k5, ymm29, ymm28, 123 +// CHECK: encoding: [0x62,0x93,0x14,0x20,0xc2,0xec,0x7b] + vcmpph k5, ymm29, ymm28, 123 + +// CHECK: vcmpph k5, xmm29, xmm28, 123 +// CHECK: encoding: [0x62,0x93,0x14,0x00,0xc2,0xec,0x7b] + vcmpph k5, xmm29, xmm28, 123 + +// CHECK: vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: 
[0x62,0xb3,0x14,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmpph k5, xmm29, word ptr [r9]{1to8}, 123 +// CHECK: encoding: [0x62,0xd3,0x14,0x10,0xc2,0x29,0x7b] + vcmpph k5, xmm29, word ptr [r9]{1to8}, 123 + +// CHECK: vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x00,0xc2,0x69,0x7f,0x7b] + vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 123 + +// CHECK: vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x17,0xc2,0x6a,0x80,0x7b] + vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 123 + +// CHECK: vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: encoding: [0x62,0xb3,0x14,0x27,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] + vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 123 + +// CHECK: vcmpph k5, ymm29, word ptr [r9]{1to16}, 123 +// CHECK: encoding: [0x62,0xd3,0x14,0x30,0xc2,0x29,0x7b] + vcmpph k5, ymm29, word ptr [r9]{1to16}, 123 + +// CHECK: vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x20,0xc2,0x69,0x7f,0x7b] + vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 123 + +// CHECK: vcmpph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16}, 123 +// CHECK: encoding: [0x62,0xf3,0x14,0x37,0xc2,0x6a,0x80,0x7b] + vcmpph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16}, 123 + +// CHECK: vdivph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5e,0xf4] + vdivph ymm30, ymm29, ymm28 + +// CHECK: vdivph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5e,0xf4] + vdivph xmm30, xmm29, xmm28 + +// CHECK: vdivph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5e,0x31] + vdivph ymm30, 
ymm29, word ptr [r9]{1to16} + +// CHECK: vdivph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5e,0x71,0x7f] + vdivph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vdivph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5e,0x72,0x80] + vdivph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vdivph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vdivph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vdivph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5e,0x31] + vdivph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vdivph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5e,0x71,0x7f] + vdivph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vdivph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5e,0x72,0x80] + vdivph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vmaxph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5f,0xf4] + vmaxph ymm30, ymm29, ymm28 + +// CHECK: vmaxph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5f,0xf4] + vmaxph xmm30, xmm29, xmm28 + +// CHECK: vmaxph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5f,0x31] + vmaxph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vmaxph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5f,0x71,0x7f] + vmaxph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vmaxph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: 
[0x62,0x65,0x14,0xb7,0x5f,0x72,0x80] + vmaxph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vmaxph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5f,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmaxph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmaxph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5f,0x31] + vmaxph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vmaxph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5f,0x71,0x7f] + vmaxph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vmaxph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5f,0x72,0x80] + vmaxph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vminph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5d,0xf4] + vminph ymm30, ymm29, ymm28 + +// CHECK: vminph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5d,0xf4] + vminph xmm30, xmm29, xmm28 + +// CHECK: vminph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vminph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5d,0x31] + vminph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vminph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5d,0x71,0x7f] + vminph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vminph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5d,0x72,0x80] + vminph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vminph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5d,0xb4,0xf5,0x00,0x00,0x00,0x10] + vminph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 
+ 268435456] + +// CHECK: vminph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5d,0x31] + vminph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vminph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5d,0x71,0x7f] + vminph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vminph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5d,0x72,0x80] + vminph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vmulph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x59,0xf4] + vmulph ymm30, ymm29, ymm28 + +// CHECK: vmulph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x59,0xf4] + vmulph xmm30, xmm29, xmm28 + +// CHECK: vmulph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x59,0x31] + vmulph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vmulph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x59,0x71,0x7f] + vmulph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vmulph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x59,0x72,0x80] + vmulph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vmulph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x59,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmulph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vmulph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x59,0x31] + vmulph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vmulph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x59,0x71,0x7f] + vmulph xmm30, xmm29, 
xmmword ptr [rcx + 2032] + +// CHECK: vmulph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x59,0x72,0x80] + vmulph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} + +// CHECK: vsubph ymm30, ymm29, ymm28 +// CHECK: encoding: [0x62,0x05,0x14,0x20,0x5c,0xf4] + vsubph ymm30, ymm29, ymm28 + +// CHECK: vsubph xmm30, xmm29, xmm28 +// CHECK: encoding: [0x62,0x05,0x14,0x00,0x5c,0xf4] + vsubph xmm30, xmm29, xmm28 + +// CHECK: vsubph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x27,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubph ymm30 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubph ymm30, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0x45,0x14,0x30,0x5c,0x31] + vsubph ymm30, ymm29, word ptr [r9]{1to16} + +// CHECK: vsubph ymm30, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0x65,0x14,0x20,0x5c,0x71,0x7f] + vsubph ymm30, ymm29, ymmword ptr [rcx + 4064] + +// CHECK: vsubph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0x65,0x14,0xb7,0x5c,0x72,0x80] + vsubph ymm30 {k7} {z}, ymm29, word ptr [rdx - 256]{1to16} + +// CHECK: vsubph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0x25,0x14,0x07,0x5c,0xb4,0xf5,0x00,0x00,0x00,0x10] + vsubph xmm30 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] + +// CHECK: vsubph xmm30, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0x45,0x14,0x10,0x5c,0x31] + vsubph xmm30, xmm29, word ptr [r9]{1to8} + +// CHECK: vsubph xmm30, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0x65,0x14,0x00,0x5c,0x71,0x7f] + vsubph xmm30, xmm29, xmmword ptr [rcx + 2032] + +// CHECK: vsubph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0x65,0x14,0x97,0x5c,0x72,0x80] + vsubph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} From d4d2b0c682c55f22978ff0eed743ebe882b3e735 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov 
Date: Sat, 14 Aug 2021 17:41:14 -0700 Subject: [PATCH 049/700] sanitizer_common: support %l in format strings Currently we only support %z and %ll width modifiers, but surprisingly not %l. This makes it impossible to print longs (sizeof(long) not necessary equal to sizeof(size_t)). We had some printf's that printed longs with %zu, but that's wrong and now with __attribute__((format)) in place they are flagged by compiler. So we either have a choice of doing static_cast(long) everywhere or add %l. Adding %l looks better, that's a standard modifier. Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D108066 --- compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp | 10 +++++++--- .../sanitizer_common/tests/sanitizer_printf_test.cpp | 3 +++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp index e25f384bb6e37..0938aa833753a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp @@ -128,7 +128,7 @@ static int AppendPointer(char **buff, const char *buff_end, u64 ptr_value) { int VSNPrintf(char *buff, int buff_length, const char *format, va_list args) { static const char *kPrintfFormatsHelp = - "Supported Printf formats: %([0-9]*)?(z|ll)?{d,u,x,X}; %p; " + "Supported Printf formats: %([0-9]*)?(z|l|ll)?{d,u,x,X}; %p; " "%[-]([0-9]*)?(\\.\\*)?s; %c\n"; RAW_CHECK(format); RAW_CHECK(buff_length > 0); @@ -160,9 +160,11 @@ int VSNPrintf(char *buff, int buff_length, } bool have_z = (*cur == 'z'); cur += have_z; - bool have_ll = !have_z && (cur[0] == 'l' && cur[1] == 'l'); + bool have_l = cur[0] == 'l' && cur[1] != 'l'; + cur += have_l; + bool have_ll = cur[0] == 'l' && cur[1] == 'l'; cur += have_ll * 2; - const bool have_length = have_z || have_ll; + const bool have_length = have_z || have_l || have_ll; const bool have_flags = have_width || have_length; // At the 
moment only %s supports precision and left-justification. CHECK(!((precision >= 0 || left_justified) && *cur != 's')); @@ -170,6 +172,7 @@ int VSNPrintf(char *buff, int buff_length, case 'd': { s64 dval = have_ll ? va_arg(args, s64) : have_z ? va_arg(args, sptr) + : have_l ? va_arg(args, long) : va_arg(args, int); result += AppendSignedDecimal(&buff, buff_end, dval, width, pad_with_zero); @@ -180,6 +183,7 @@ int VSNPrintf(char *buff, int buff_length, case 'X': { u64 uval = have_ll ? va_arg(args, u64) : have_z ? va_arg(args, uptr) + : have_l ? va_arg(args, unsigned long) : va_arg(args, unsigned); bool uppercase = (*cur == 'X'); result += AppendUnsigned(&buff, buff_end, uval, (*cur == 'u') ? 10 : 16, diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_printf_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_printf_test.cpp index d213d107c0195..01e81fb0b6df6 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_printf_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_printf_test.cpp @@ -115,6 +115,9 @@ TEST(Printf, MinMax) { TestAgainstLibc("%d-%d", INT_MIN, INT_MAX); TestAgainstLibc("%u-%u", 0, UINT_MAX); TestAgainstLibc("%x-%x", 0, UINT_MAX); + TestAgainstLibc("%ld-%ld", LONG_MIN, LONG_MAX); + TestAgainstLibc("%lu-%lu", 0, LONG_MAX); + TestAgainstLibc("%lx-%lx", 0, LONG_MAX); #if !defined(_WIN32) // %z* format doesn't seem to be supported by MSVS. TestAgainstLibc("%zd-%zd", LONG_MIN, LONG_MAX); From 705b1191aad3e46b72eac8c0dc965d408d6147d0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 14 Aug 2021 17:24:21 -0700 Subject: [PATCH 050/700] [X86] Add parentheses around casts in X86 intrinsic headers. Fixes PR51324. 
--- clang/lib/Headers/avx512bwintrin.h | 122 +- clang/lib/Headers/avx512dqintrin.h | 730 +++--- clang/lib/Headers/avx512erintrin.h | 204 +- clang/lib/Headers/avx512fintrin.h | 3072 +++++++++++------------ clang/lib/Headers/avx512vbmi2intrin.h | 96 +- clang/lib/Headers/avx512vlbwintrin.h | 188 +- clang/lib/Headers/avx512vldqintrin.h | 268 +- clang/lib/Headers/avx512vlintrin.h | 1106 ++++---- clang/lib/Headers/avx512vlvbmi2intrin.h | 192 +- clang/lib/Headers/avx512vlvnniintrin.h | 16 +- clang/lib/Headers/f16cintrin.h | 8 +- clang/lib/Headers/gfniintrin.h | 95 +- clang/lib/Headers/vpclmulqdqintrin.h | 12 +- clang/lib/Headers/xopintrin.h | 62 +- 14 files changed, 3085 insertions(+), 3086 deletions(-) diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 4281a33d375c2..6aee8aed84871 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -178,16 +178,16 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B) } #define _kshiftli_mask32(A, I) \ - (__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)) + ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))) #define _kshiftri_mask32(A, I) \ - (__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)) + ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))) #define _kshiftli_mask64(A, I) \ - (__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)) + ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))) #define _kshiftri_mask64(A, I) \ - (__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)) + ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))) static __inline__ unsigned int __DEFAULT_FN_ATTRS _cvtmask32_u32(__mmask32 __A) { @@ -232,44 +232,44 @@ _store_mask64(__mmask64 *__A, __mmask64 __B) { /* Integer compare */ #define _mm512_cmp_epi8_mask(a, b, p) \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - 
(__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1) + ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1)) #define _mm512_mask_cmp_epi8_mask(m, a, b, p) \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)) + ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m))) #define _mm512_cmp_epu8_mask(a, b, p) \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1) + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1)) #define _mm512_mask_cmp_epu8_mask(m, a, b, p) \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)) + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m))) #define _mm512_cmp_epi16_mask(a, b, p) \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm512_mask_cmp_epi16_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm512_cmp_epu16_mask(a, b, p) \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm512_mask_cmp_epu16_mask(m, a, b, p) \ - 
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm512_cmpeq_epi8_mask(A, B) \ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) @@ -1428,36 +1428,36 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) #define _mm512_shufflehi_epi16(A, imm) \ - (__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)) + ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))) #define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflehi_epi16((A), \ + (imm)), \ + (__v32hi)(__m512i)(W))) #define _mm512_maskz_shufflehi_epi16(U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflehi_epi16((A), \ + (imm)), \ + (__v32hi)_mm512_setzero_si512())) #define _mm512_shufflelo_epi16(A, imm) \ - (__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)) + ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))) #define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflelo_epi16((A), \ + (imm)), \ + (__v32hi)(__m512i)(W))) #define _mm512_maskz_shufflelo_epi16(U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512()) + 
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflelo_epi16((A), \ + (imm)), \ + (__v32hi)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sllv_epi16(__m512i __A, __m512i __B) @@ -1527,7 +1527,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) } #define _mm512_bslli_epi128(a, imm) \ - (__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) + ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srlv_epi16(__m512i __A, __m512i __B) @@ -1664,7 +1664,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) } #define _mm512_bsrli_epi128(a, imm) \ - (__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) + ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) @@ -1984,32 +1984,32 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, } #define _mm512_alignr_epi8(A, B, N) \ - (__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(N)) + ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(N))) #define _mm512_mask_alignr_epi8(W, U, A, B, N) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ + (__v64qi)(__m512i)(W))) #define _mm512_maskz_alignr_epi8(U, A, B, N) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)_mm512_setzero_si512()) + (__v64qi)(__m512i)_mm512_setzero_si512())) #define 
_mm512_dbsad_epu8(A, B, imm) \ - (__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(imm))) #define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)(__m512i)(W)) + (__v32hi)(__m512i)(W))) #define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)_mm512_setzero_si512()) + (__v32hi)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sad_epu8 (__m512i __A, __m512i __B) diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index 337256c50f50d..3ba0a0cfd5fdf 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -121,10 +121,10 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B) } #define _kshiftli_mask8(A, I) \ - (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)) + ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))) #define _kshiftri_mask8(A, I) \ - (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)) + ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))) static __inline__ unsigned int __DEFAULT_FN_ATTRS _cvtmask8_u32(__mmask8 __A) { @@ -342,19 +342,19 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvt_roundpd_epi64(A, R) \ - (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epi64(W, U, 
A, R) \ - (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epu64 (__m512d __A) { @@ -381,19 +381,19 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvt_roundpd_epu64(A, R) \ - (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epi64 (__m256 __A) { @@ -420,19 +420,19 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvt_roundps_epi64(A, R) \ - (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + 
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epu64 (__m256 __A) { @@ -459,19 +459,19 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvt_roundps_epu64(A, R) \ - (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -494,19 +494,19 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepi64_pd(A, R) \ - 
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \ - (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_ps (__m512i __A) { @@ -533,19 +533,19 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepi64_ps(A, R) \ - (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \ - (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)) + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \ - (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -573,19 +573,19 @@ _mm512_maskz_cvttpd_epi64 
(__mmask8 __U, __m512d __A) { } #define _mm512_cvtt_roundpd_epi64(A, R) \ - (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epu64 (__m512d __A) { @@ -612,19 +612,19 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvtt_roundpd_epu64(A, R) \ - (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + 
(__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epi64 (__m256 __A) { @@ -651,19 +651,19 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvtt_roundps_epi64(A, R) \ - (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epu64 (__m256 __A) { @@ -690,19 +690,19 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvtt_roundps_epu64(A, R) \ - (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \ - 
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepu64_pd (__m512i __A) { @@ -724,20 +724,20 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepu64_pd(A, R) \ - (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \ - (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256 __DEFAULT_FN_ATTRS512 @@ -765,290 +765,290 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepu64_ps(A, R) \ - (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \ - (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)) + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + 
(__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \ - (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) #define _mm512_range_pd(A, B, C) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_range_pd(W, U, A, B, C) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_range_pd(U, A, B, C) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_range_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ - 
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_range_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm512_range_ps(A, B, C) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_range_ps(W, U, A, B, C) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_range_ps(U, A, B, C) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_range_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), 
\ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), \ + (int)(R))) #define _mm512_maskz_range_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm_range_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8) -1, (int)(C),\ - (int)(R)) + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8) -1, (int)(C),\ + (int)(R))) #define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_mask_range_round_ss(W, U, A, B, C, R) \ - (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W),\ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W),\ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION) #define _mm_maskz_range_round_ss(U, A, B, C, R) \ 
- (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_range_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8) -1, (int)(C),\ - (int)(R)) + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8) -1, (int)(C),\ + (int)(R))) #define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_mask_range_round_sd(W, U, A, B, C, R) \ - (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W),\ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W),\ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_maskz_range_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm512_reduce_pd(A, B) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - 
(__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_reduce_pd(W, U, A, B) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_reduce_pd(U, A, B) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_reduce_ps(A, B) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_reduce_ps(W, U, A, B) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_reduce_ps(U, A, B) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) 
#define _mm512_reduce_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_reduce_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm512_reduce_round_ps(A, B, R) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_reduce_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm_reduce_ss(A, B, C) \ - 
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_reduce_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_reduce_ss(U, A, B, C) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_reduce_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), (int)(R)) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ + (int)(C), (int)(R))) #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), (int)(R)) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(C), (int)(R))) #define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), 
(int)(C), (int)(R)) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C), (int)(R))) #define _mm_reduce_sd(A, B, C) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_reduce_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), (__mmask8)(U), \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_reduce_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_reduce_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), (int)(R)) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(C), (int)(R))) #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), (int)(R)) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + 
(__v2df)(__m128d)(W), (__mmask8)(U), \ + (int)(C), (int)(R))) #define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), (int)(R)) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C), (int)(R))) static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_movepi32_mask (__m512i __A) @@ -1218,158 +1218,158 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) } #define _mm512_extractf32x8_ps(A, imm) \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_undefined_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_undefined_ps(), \ + (__mmask8)-1)) #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf32x8_ps(U, A, imm) \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) #define _mm512_extractf64x2_pd(A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1)) #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - 
(__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf64x2_pd(U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm512_extracti32x8_epi32(A, imm) \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1)) #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extracti32x8_epi32(U, A, imm) \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U))) #define _mm512_extracti64x2_epi64(A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ (int)(imm), \ (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1) + (__mmask8)-1)) #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + 
(__mmask8)(U))) #define _mm512_maskz_extracti64x2_epi64(U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm512_insertf32x8(A, B, imm) \ - (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ - (__v8sf)(__m256)(B), (int)(imm)) + ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ + (__v8sf)(__m256)(B), (int)(imm))) #define _mm512_mask_insertf32x8(W, U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_insertf32x8(U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) #define _mm512_insertf64x2(A, B, imm) \ - (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ - (__v2df)(__m128d)(B), (int)(imm)) + ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ + (__v2df)(__m128d)(B), (int)(imm))) #define _mm512_mask_insertf64x2(W, U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_insertf64x2(U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_inserti32x8(A, B, imm) \ - (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ - 
(__v8si)(__m256i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) #define _mm512_mask_inserti32x8(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + (__v16si)(__m512i)(W))) #define _mm512_maskz_inserti32x8(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + (__v16si)_mm512_setzero_si512())) #define _mm512_inserti64x2(A, B, imm) \ - (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ - (__v2di)(__m128i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) #define _mm512_mask_inserti64x2(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + (__v8di)(__m512i)(W))) #define _mm512_maskz_inserti64x2(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + (__v8di)_mm512_setzero_si512())) #define _mm512_mask_fpclass_ps_mask(U, A, imm) \ - (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)(U)) + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ + (int)(imm), (__mmask16)(U))) #define _mm512_fpclass_ps_mask(A, imm) \ - (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)-1) + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ + (int)(imm), (__mmask16)-1)) #define _mm512_mask_fpclass_pd_mask(U, A, imm) \ 
- (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm512_fpclass_pd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_fpclass_sd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fpclass_sd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm_fpclass_ss_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fpclass_ss_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)(U))) #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512erintrin.h b/clang/lib/Headers/avx512erintrin.h index 8570061699068..1c5a2d2d208ff 100644 --- a/clang/lib/Headers/avx512erintrin.h +++ b/clang/lib/Headers/avx512erintrin.h @@ -15,19 +15,19 @@ /* exp2a23 */ #define _mm512_exp2a23_round_pd(A, R) \ - (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_exp2a23_round_pd(S, M, A, R) \ - 
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)) + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) #define _mm512_maskz_exp2a23_round_pd(M, A, R) \ - (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm512_exp2a23_pd(A) \ _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -39,19 +39,19 @@ _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm512_exp2a23_round_ps(A, R) \ - (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_exp2a23_round_ps(S, M, A, R) \ - (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)) + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) #define _mm512_maskz_exp2a23_round_ps(M, A, R) \ - (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)) + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) #define _mm512_exp2a23_ps(A) \ _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -64,19 +64,19 @@ /* rsqrt28 */ #define _mm512_rsqrt28_round_pd(A, R) \ - (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define 
_mm512_mask_rsqrt28_round_pd(S, M, A, R) \ - (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)) + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) #define _mm512_maskz_rsqrt28_round_pd(M, A, R) \ - (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm512_rsqrt28_pd(A) \ _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -88,19 +88,19 @@ _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm512_rsqrt28_round_ps(A, R) \ - (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \ - (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)) + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) #define _mm512_maskz_rsqrt28_round_ps(M, A, R) \ - (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)) + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) #define _mm512_rsqrt28_ps(A) \ _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -112,22 +112,22 @@ _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm_rsqrt28_round_ss(A, B, R) \ - (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, 
(int)(R)) + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \ - (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \ - (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) #define _mm_rsqrt28_ss(A, B) \ _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -139,22 +139,22 @@ _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) #define _mm_rsqrt28_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \ - (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \ - (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + 
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm_rsqrt28_sd(A, B) \ _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -167,19 +167,19 @@ /* rcp28 */ #define _mm512_rcp28_round_pd(A, R) \ - (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_rcp28_round_pd(S, M, A, R) \ - (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)) + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) #define _mm512_maskz_rcp28_round_pd(M, A, R) \ - (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm512_rcp28_pd(A) \ _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -191,19 +191,19 @@ _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm512_rcp28_round_ps(A, R) \ - (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_rcp28_round_ps(S, M, A, R) \ - (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)) + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) #define _mm512_maskz_rcp28_round_ps(M, A, R) \ - (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - 
(__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)) + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) #define _mm512_rcp28_ps(A) \ _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -215,22 +215,22 @@ _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm_rcp28_round_ss(A, B, R) \ - (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rcp28_round_ss(S, M, A, B, R) \ - (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rcp28_round_ss(M, A, B, R) \ - (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) #define _mm_rcp28_ss(A, B) \ _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -242,22 +242,22 @@ _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) #define _mm_rcp28_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rcp28_round_sd(S, M, A, B, R) \ - 
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rcp28_round_sd(M, A, B, R) \ - (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm_rcp28_sd(A, B) \ _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 010bcadab0195..df298640523b7 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -937,18 +937,18 @@ _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) } #define _mm512_max_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_max_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)(W)) + (__v8df)(W))) #define _mm512_maskz_max_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_max_pd(__m512d __A, __m512d __B) @@ -974,18 +974,18 @@ _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) } #define _mm512_max_round_ps(A, B, R) \ - 
(__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_max_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)(W)) + (__v16sf)(W))) #define _mm512_maskz_max_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_max_ps(__m512 __A, __m512 __B) @@ -1029,22 +1029,22 @@ _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_max_round_ss(A, B, R) \ - (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_max_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_max_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ 
-1065,22 +1065,22 @@ _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_max_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_max_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_max_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1172,18 +1172,18 @@ _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) } #define _mm512_min_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_min_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)(W)) + (__v8df)(W))) #define _mm512_maskz_min_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) static __inline__ 
__m512d __DEFAULT_FN_ATTRS512 _mm512_min_pd(__m512d __A, __m512d __B) @@ -1209,18 +1209,18 @@ _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) } #define _mm512_min_round_ps(A, B, R) \ - (__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_min_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ - (__v16sf)(W)) + (__v16sf)(W))) #define _mm512_maskz_min_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_min_ps(__m512 __A, __m512 __B) @@ -1264,22 +1264,22 @@ _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_min_round_ss(A, B, R) \ - (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_min_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_min_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + 
(__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -1300,22 +1300,22 @@ _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_min_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_min_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_min_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1485,17 +1485,17 @@ _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { } #define _mm512_sqrt_round_pd(A, R) \ - (__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)) + ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R))) #define _mm512_mask_sqrt_round_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sqrt_round_pd((A), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_sqrt_round_pd(U, A, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + 
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sqrt_round_pd((A), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_sqrt_pd(__m512d __A) @@ -1521,17 +1521,17 @@ _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) } #define _mm512_sqrt_round_ps(A, R) \ - (__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)) + ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R))) #define _mm512_mask_sqrt_round_ps(W, U, A, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_sqrt_round_ps(U, A, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_sqrt_ps(__m512 __A) @@ -1900,22 +1900,22 @@ _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_add_round_ss(A, B, R) \ - (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_add_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_add_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - 
(__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -1929,22 +1929,22 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } #define _mm_add_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_add_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_add_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { @@ -1975,32 +1975,32 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_add_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_add_round_pd(W, U, A, B, R) \ - 
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_add_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_add_round_ps(A, B, R) \ - (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_add_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_add_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2014,22 +2014,22 @@ _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } #define _mm_sub_round_ss(A, B, R) \ - (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_sub_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ 
- (int)(R)) + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_sub_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -2044,22 +2044,22 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_sub_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_sub_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_sub_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { @@ -2090,32 +2090,32 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_sub_round_pd(A, B, R) \ - 
(__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_sub_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_sub_round_ps(A, B, R) \ - (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_sub_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2129,22 +2129,22 @@ _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } #define _mm_mul_round_ss(A, B, R) \ - (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + 
(__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_mul_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_mul_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -2159,22 +2159,22 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_mul_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_mul_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_mul_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ 
__m512d __DEFAULT_FN_ATTRS512 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { @@ -2205,32 +2205,32 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_mul_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_mul_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_mul_round_ps(A, B, R) \ - (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_mul_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2245,22 +2245,22 @@ _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_div_round_ss(A, B, R) \ - (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ 
- (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_div_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_div_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -2275,22 +2275,22 @@ _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_div_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_div_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_div_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + 
((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_div_pd(__m512d __a, __m512d __b) @@ -2333,179 +2333,179 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_div_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_div_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_div_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_div_round_ps(A, B, R) \ - (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_div_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_div_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) #define _mm512_roundscale_ps(A, B) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - 
(__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_ps(A, B, C, imm) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ (__v16sf)(__m512)(A), (__mmask16)(B), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_roundscale_ps(A, B, imm) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ (__v16sf)(__m512)(A), (__mmask16)(B), \ - (int)(R)) + (int)(R))) #define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), (int)(R)) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), (int)(R))) #define _mm512_roundscale_round_ps(A, imm, R) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_roundscale_pd(A, B) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, \ - 
_MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_pd(A, B, C, imm) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ (__v8df)(__m512d)(A), (__mmask8)(B), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_roundscale_pd(A, B, imm) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ (__v8df)(__m512d)(A), (__mmask8)(B), \ - (int)(R)) + (int)(R))) #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), (int)(R)) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), (int)(R))) #define _mm512_roundscale_round_pd(A, imm, R) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_fmadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + 
((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fmsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - 
(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fnmadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fnmsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d 
__DEFAULT_FN_ATTRS512 @@ -2629,87 +2629,87 @@ _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) } #define _mm512_fmadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fmsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - 
-(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fnmadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fnmsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ - 
(__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -2833,52 +2833,52 @@ _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) } #define _mm512_fmaddsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fmsubadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - 
(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -2952,52 +2952,52 @@ _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) } #define _mm512_fmaddsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), 
(int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fmsubadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3071,10 +3071,10 @@ _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) } #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), 
\ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3088,10 +3088,10 @@ _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) } #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) @@ -3104,10 +3104,10 @@ _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) } #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3121,10 +3121,10 @@ _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) } #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3138,10 +3138,10 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) } #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - 
(__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3155,10 +3155,10 @@ _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) } #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3172,17 +3172,17 @@ _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) } #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3206,17 +3206,17 @@ _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) } #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + 
(__mmask16)(U), (int)(R))) #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3312,63 +3312,63 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, } #define _mm512_alignr_epi64(A, B, I) \ - (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_alignr_epi64(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_alignr_epi32(A, B, I) \ - (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_alignr_epi32(U, A, B, imm) \ - 
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) /* Vector Extract */ #define _mm512_extractf64x4_pd(A, I) \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_undefined_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ + (__v4df)_mm256_undefined_pd(), \ + (__mmask8)-1)) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf64x4_pd(U, A, imm) \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm512_extractf32x4_ps(A, I) \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1)) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + 
((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) /* Vector Blend */ @@ -3407,14 +3407,14 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Compare */ #define _mm512_cmp_round_ps_mask(A, B, P, R) \ - (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)-1, (int)(R)) + ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(P), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ - (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)(U), (int)(R)) + ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(P), \ + (__mmask16)(U), (int)(R))) #define _mm512_cmp_ps_mask(A, B, P) \ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) @@ -3462,14 +3462,14 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) #define _mm512_cmp_round_pd_mask(A, B, P, R) \ - (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)-1, (int)(R)) + ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(P), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ - (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)(U), (int)(R)) + ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(P), \ + (__mmask8)(U), (int)(R))) #define _mm512_cmp_pd_mask(A, B, P) \ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) @@ -3519,19 +3519,19 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Conversion */ #define _mm512_cvtt_roundps_epu32(A, R) \ - 
(__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_undefined_epi32(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_undefined_epi32(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ - (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -3563,34 +3563,34 @@ _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) } #define _mm512_cvt_roundepi32_ps(A, R) \ - (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ - (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ - (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm512_cvt_roundepu32_ps(A, R) \ - 
(__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ - (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ - (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtepu32_ps (__m512i __A) @@ -3705,19 +3705,19 @@ _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) } #define _mm512_cvt_roundpd_ps(A, R) \ - (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ - (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)) + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ - (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ps (__m512d __A) @@ 
-3765,38 +3765,38 @@ _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) } #define _mm512_cvt_roundps_ph(A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_undefined_si256(), \ - (__mmask16)-1) + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)_mm256_undefined_si256(), \ + (__mmask16)-1)) #define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)(__m256i)(U), \ - (__mmask16)(W)) + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)(__m256i)(U), \ + (__mmask16)(W))) #define _mm512_maskz_cvt_roundps_ph(W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W)) + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)_mm256_setzero_si256(), \ + (__mmask16)(W))) #define _mm512_cvtps_ph _mm512_cvt_roundps_ph #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph #define _mm512_cvt_roundph_ps(A, R) \ - (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ - (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundph_ps(U, A, R) \ - (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + 
((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline __m512 __DEFAULT_FN_ATTRS512 @@ -3828,19 +3828,19 @@ _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) } #define _mm512_cvtt_roundpd_epi32(A, R) \ - (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline __m256i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epi32(__m512d __a) @@ -3870,19 +3870,19 @@ _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) } #define _mm512_cvtt_roundps_epi32(A, R) \ - (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ - 
(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epi32(__m512 __a) @@ -3912,19 +3912,19 @@ _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) } #define _mm512_cvt_roundps_epi32(A, R) \ - (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epi32 (__m512 __A) @@ -3955,19 +3955,19 @@ _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) } #define _mm512_cvt_roundpd_epi32(A, R) \ - (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + 
((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epi32 (__m512d __A) @@ -3999,19 +3999,19 @@ _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) } #define _mm512_cvt_roundps_epu32(A, R) \ - (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epu32 ( __m512 __A) @@ -4043,19 +4043,19 @@ _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) } #define _mm512_cvt_roundpd_epu32(A, R) \ - (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) #define 
_mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epu32 (__m512d __A) @@ -4975,70 +4975,70 @@ _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) #define _mm512_cmp_epi32_mask(a, b, p) \ - (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm512_cmp_epu32_mask(a, b, p) \ - (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm512_cmp_epi64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm512_cmp_epu64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ - 
(__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm512_rol_epi32(a, b) \ - (__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)) + ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))) #define _mm512_mask_rol_epi32(W, U, a, b) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_rol_epi32((a), (b)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_rol_epi32(U, a, b) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_rol_epi32((a), (b)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_rol_epi64(a, b) \ - (__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)) + 
((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))) #define _mm512_mask_rol_epi64(W, U, a, b) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_rol_epi64((a), (b)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_rol_epi64(U, a, b) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_rol_epi64((a), (b)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rolv_epi32 (__m512i __A, __m512i __B) @@ -5085,30 +5085,30 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) } #define _mm512_ror_epi32(A, B) \ - (__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)) + ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))) #define _mm512_mask_ror_epi32(W, U, A, B) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_ror_epi32((A), (B)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_ror_epi32(U, A, B) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_ror_epi32((A), (B)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_ror_epi64(A, B) \ - (__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)) + ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))) #define _mm512_mask_ror_epi64(W, U, A, B) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_ror_epi64((A), (B)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + 
(__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_ror_epi64(U, A, B) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_ror_epi64((A), (B)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_slli_epi32(__m512i __A, unsigned int __B) @@ -5304,168 +5304,168 @@ _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) } #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) #define _mm512_fixupimm_pd(A, B, C, imm) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ 
+ (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ - (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), \ - (int)(imm), (__mmask8)(U), \ - (int)(R)) + ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), \ + (int)(imm), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \ - (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), \ - (int)(imm), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), \ + (int)(imm), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_fixupimm_round_ps(A, B, C, imm, R) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U), (int)(R))) #define _mm512_fixupimm_ps(A, B, C, imm) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + 
(__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ - (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U), \ - (int)(R)) + ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U), \ + (int)(R))) #define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ - (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_fixupimm_round_sd(A, B, C, imm, R) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + 
(__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) #define _mm_fixupimm_sd(A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ - (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ +#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_fixupimm_round_ss(A, B, C, imm, R) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + 
(__mmask8)-1, (int)(R))) #define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) #define _mm_fixupimm_ss(A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ - (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ +#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ + ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_getexp_round_sd(A, B, R) 
\ - (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 @@ -5486,10 +5486,10 @@ _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_mask_getexp_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) @@ -5502,16 +5502,16 @@ _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) } #define _mm_maskz_getexp_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm_getexp_round_ss(A, B, R) \ - (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_getexp_ss (__m128 __A, __m128 __B) @@ -5531,10 +5531,10 @@ _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_mask_getexp_round_ss(W, U, A, B, R) \ - 
(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) @@ -5547,100 +5547,100 @@ _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) } #define _mm_maskz_getexp_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) #define _mm_getmant_round_sd(A, B, C, D, R) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_getmant_sd(A, B, C, D) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_sd(W, U, A, B, C, D) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + 
((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_getmant_sd(U, A, B, C, D) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) -#define _mm_getmant_round_ss(A, B, C, D, R) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) +#define _mm_getmant_round_ss(A, B, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_getmant_ss(A, B, C, D) \ - 
(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_ss(W, U, A, B, C, D) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_getmant_ss(U, A, B, C, D) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + 
(__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kmov (__mmask16 __A) @@ -5649,16 +5649,16 @@ _mm512_kmov (__mmask16 __A) } #define _mm_comi_round_sd(A, B, P, R) \ - (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ - (int)(P), (int)(R)) + ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ + (int)(P), (int)(R))) #define _mm_comi_round_ss(A, B, P, R) \ - (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ - (int)(P), (int)(R)) + ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + (int)(P), (int)(R))) #ifdef __x86_64__ #define _mm_cvt_roundsd_si64(A, R) \ - (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) #endif static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5926,54 +5926,54 @@ _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) } #define _mm512_ternarylogic_epi32(A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1) + ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1)) #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U)) + ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U))) #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U)) + 
((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U))) #define _mm512_ternarylogic_epi64(A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ + ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U))) #ifdef __x86_64__ #define _mm_cvt_roundsd_i64(A, R) \ - (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) #endif #define _mm_cvt_roundsd_si32(A, R) \ - (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvt_roundsd_i32(A, R) \ - (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvt_roundsd_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R))) static __inline__ unsigned 
__DEFAULT_FN_ATTRS128 _mm_cvtsd_u32 (__m128d __A) @@ -5984,8 +5984,8 @@ _mm_cvtsd_u32 (__m128d __A) #ifdef __x86_64__ #define _mm_cvt_roundsd_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvtsd_u64 (__m128d __A) @@ -5997,21 +5997,21 @@ _mm_cvtsd_u64 (__m128d __A) #endif #define _mm_cvt_roundss_si32(A, R) \ - (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvt_roundss_i32(A, R) \ - (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) #ifdef __x86_64__ #define _mm_cvt_roundss_si64(A, R) \ - (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvt_roundss_i64(A, R) \ - (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) #endif #define _mm_cvt_roundss_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvtss_u32 (__m128 __A) @@ -6022,8 +6022,8 @@ _mm_cvtss_u32 (__m128 __A) #ifdef __x86_64__ #define _mm_cvt_roundss_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvtss_u64 (__m128 __A) @@ -6035,10 +6035,10 @@ _mm_cvtss_u64 (__m128 __A) #endif #define _mm_cvtt_roundsd_i32(A, R) \ - (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) + 
((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvtt_roundsd_si32(A, R) \ - (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsd_i32 (__m128d __A) @@ -6049,10 +6049,10 @@ _mm_cvttsd_i32 (__m128d __A) #ifdef __x86_64__ #define _mm_cvtt_roundsd_si64(A, R) \ - (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvtt_roundsd_i64(A, R) \ - (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsd_i64 (__m128d __A) @@ -6063,7 +6063,7 @@ _mm_cvttsd_i64 (__m128d __A) #endif #define _mm_cvtt_roundsd_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvttsd_u32 (__m128d __A) @@ -6074,8 +6074,8 @@ _mm_cvttsd_u32 (__m128d __A) #ifdef __x86_64__ #define _mm_cvtt_roundsd_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvttsd_u64 (__m128d __A) @@ -6087,10 +6087,10 @@ _mm_cvttsd_u64 (__m128d __A) #endif #define _mm_cvtt_roundss_i32(A, R) \ - (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvtt_roundss_si32(A, R) \ - (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttss_i32 
(__m128 __A) @@ -6101,10 +6101,10 @@ _mm_cvttss_i32 (__m128 __A) #ifdef __x86_64__ #define _mm_cvtt_roundss_i64(A, R) \ - (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvtt_roundss_si64(A, R) \ - (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttss_i64 (__m128 __A) @@ -6115,7 +6115,7 @@ _mm_cvttss_i64 (__m128 __A) #endif #define _mm_cvtt_roundss_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvttss_u32 (__m128 __A) @@ -6126,8 +6126,8 @@ _mm_cvttss_u32 (__m128 __A) #ifdef __x86_64__ #define _mm_cvtt_roundss_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvttss_u64 (__m128 __A) @@ -6139,30 +6139,30 @@ _mm_cvttss_u64 (__m128 __A) #endif #define _mm512_permute_pd(X, C) \ - (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)) + ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))) #define _mm512_mask_permute_pd(W, U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_permute_pd(U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + 
(__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_permute_ps(X, C) \ - (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)) + ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))) #define _mm512_mask_permute_ps(W, U, X, C) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_permute_ps(U, X, C) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutevar_pd(__m512d __A, __m512i __C) @@ -6274,19 +6274,19 @@ _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) #define _mm512_cvtt_roundpd_epu32(A, R) \ - (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256i 
__DEFAULT_FN_ATTRS512 _mm512_cvttpd_epu32 (__m512d __A) @@ -6318,106 +6318,106 @@ _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) } #define _mm_roundscale_round_sd(A, B, imm, R) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(imm), \ - (int)(R)) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(imm), \ + (int)(R))) #define _mm_roundscale_sd(A, B, imm) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_sd(W, U, A, B, imm) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(imm), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm_maskz_roundscale_sd(U, A, B, I) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION) + 
((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm_roundscale_round_ss(A, B, imm, R) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(imm), \ - (int)(R)) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(imm), \ + (int)(R))) #define _mm_roundscale_ss(A, B, imm) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_ss(W, U, A, B, I) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + 
((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm_maskz_roundscale_ss(U, A, B, I) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm512_scalef_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_scalef_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)_mm512_setzero_pd(), \ + 
(__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_scalef_pd (__m512d __A, __m512d __B) @@ -6452,22 +6452,22 @@ _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) } #define _mm512_scalef_round_ps(A, B, R) \ - (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_scalef_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_scalef_ps (__m512 __A, __m512 __B) @@ -6502,10 +6502,10 @@ _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) } #define _mm_scalef_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_scalef_sd (__m128d __A, __m128d __B) @@ -6527,10 +6527,10 @@ _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define 
_mm_mask_scalef_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) @@ -6543,16 +6543,16 @@ _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) } #define _mm_maskz_scalef_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm_scalef_round_ss(A, B, R) \ - (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_scalef_ss (__m128 __A, __m128 __B) @@ -6574,10 +6574,10 @@ _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_mask_scalef_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) @@ -6590,11 +6590,11 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) } #define _mm_maskz_scalef_round_ss(U, A, B, R) \ - 
(__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), \ + (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srai_epi32(__m512i __A, unsigned int __B) @@ -6642,94 +6642,94 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) } #define _mm512_shuffle_f32x4(A, B, imm) \ - (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(imm)) + ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(imm))) #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) #define _mm512_shuffle_f64x2(A, B, imm) \ - (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(imm)) + ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(imm))) #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ - 
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_shuffle_i32x4(A, B, imm) \ - (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(imm))) #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_shuffle_i64x2(A, B, imm) \ - (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(imm))) #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + 
(__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_shuffle_pd(A, B, M) \ - (__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(M)) + ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(M))) #define _mm512_mask_shuffle_pd(W, U, A, B, M) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_shuffle_pd(U, A, B, M) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_shuffle_ps(A, B, M) \ - (__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(M)) + ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(M))) #define _mm512_mask_shuffle_ps(W, U, A, B, M) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_shuffle_ps(U, A, B, M) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)_mm512_setzero_ps())) #define _mm_sqrt_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + 
((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -6742,10 +6742,10 @@ _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) @@ -6758,16 +6758,16 @@ _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) } #define _mm_maskz_sqrt_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm_sqrt_round_ss(A, B, R) \ - (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -6780,10 +6780,10 @@ _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_mask_sqrt_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + 
((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) @@ -6796,10 +6796,10 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) } #define _mm_maskz_sqrt_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_broadcast_f32x4(__m128 __A) @@ -7366,183 +7366,183 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) } #define _mm512_extracti32x4_epi32(A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1)) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm512_extracti64x4_epi64(A, imm) \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ - (__mmask8)-1) + 
((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_undefined_si256(), \ + (__mmask8)-1)) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_setzero_si256(), \ + (__mmask8)(U))) #define _mm512_insertf64x4(A, B, imm) \ - (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm)) + ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ + (__v4df)(__m256d)(B), (int)(imm))) #define _mm512_mask_insertf64x4(W, U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_insertf64x4(U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_inserti64x4(A, B, imm) \ - (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) #define _mm512_mask_inserti64x4(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - 
(__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_inserti64x4(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_insertf32x4(A, B, imm) \ - (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm)) + ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ + (__v4sf)(__m128)(B), (int)(imm))) #define _mm512_mask_insertf32x4(W, U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_insertf32x4(U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) #define _mm512_inserti32x4(A, B, imm) \ - (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) #define _mm512_mask_inserti32x4(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_inserti32x4(U, A, B, 
imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_getmant_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm512_getmant_pd(A, B, C) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_getmant_pd(W, U, A, B, C) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + 
((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_getmant_pd(U, A, B, C) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_getmant_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm512_getmant_ps(A, B, C) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)_mm512_undefined_ps(), \ + 
(__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_getmant_ps(W, U, A, B, C) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_getmant_ps(U, A, B, C) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_getexp_round_pd(A, R) \ - (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_getexp_round_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_getexp_round_pd(U, A, R) \ - (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_getexp_pd (__m512d __A) @@ -7572,19 +7572,19 @@ _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) } #define _mm512_getexp_round_ps(A, R) \ - (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - 
(__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_getexp_round_ps(W, U, A, R) \ - (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_getexp_round_ps(U, A, R) \ - (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_getexp_ps (__m512 __A) @@ -7614,100 +7614,100 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) } #define _mm512_i64gather_ps(index, addr, scale) \ - (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ - (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) - -#define _mm512_i64gather_epi32(index, addr, scale) \ - (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ + ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ (void const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)-1, (int)(scale)) + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i64gather_epi32(index, addr, scale) \ + ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ + (void 
const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)-1, (int)(scale))) #define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i64gather_pd(index, addr, scale) \ - (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ - (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i64gather_epi64(index, addr, scale) \ - (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) 
#define _mm512_i32gather_ps(index, addr, scale) \ - (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ - (void const *)(addr), \ - (__v16si)(__m512)(index), \ - (__mmask16)-1, (int)(scale)) + ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ + (void const *)(addr), \ + (__v16si)(__m512)(index), \ + (__mmask16)-1, (int)(scale))) #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ - (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ - (void const *)(addr), \ - (__v16si)(__m512)(index), \ - (__mmask16)(mask), (int)(scale)) + ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ + (void const *)(addr), \ + (__v16si)(__m512)(index), \ + (__mmask16)(mask), (int)(scale))) #define _mm512_i32gather_epi32(index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v16si)(__m512i)(index), \ - (__mmask16)-1, (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v16si)(__m512i)(index), \ + (__mmask16)-1, (int)(scale))) #define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v16si)(__m512i)(index), \ - (__mmask16)(mask), (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v16si)(__m512i)(index), \ + (__mmask16)(mask), (int)(scale))) #define _mm512_i32gather_pd(index, addr, scale) \ - (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ - 
(__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i32gather_epi64(index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i64scatter_ps(addr, index, v1, scale) \ __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ @@ -7800,16 +7800,16 @@ _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fmadd_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 
_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) @@ -7822,10 +7822,10 @@ _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -7838,10 +7838,10 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -7854,16 +7854,16 @@ _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fmsub_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmsub_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ss (__mmask8 __U, 
__m128 __A, __m128 __B, __m128 __C) @@ -7876,10 +7876,10 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -7892,10 +7892,10 @@ _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -7908,16 +7908,16 @@ _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fnmadd_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 
__C) @@ -7930,10 +7930,10 @@ _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -7946,10 +7946,10 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -7962,16 +7962,16 @@ _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fnmsub_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) @@ -7984,10 
+7984,10 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -8000,10 +8000,10 @@ _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8016,16 +8016,16 @@ _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fmadd_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmadd_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) @@ -8038,10 
+8038,10 @@ _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8054,10 +8054,10 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8070,16 +8070,16 @@ _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fmsub_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d 
__C) @@ -8092,10 +8092,10 @@ _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8108,10 +8108,10 @@ _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8124,16 +8124,16 @@ _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fnmadd_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, 
__m128d __B, __m128d __C) @@ -8146,10 +8146,10 @@ _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8162,10 +8162,10 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8178,16 +8178,16 @@ _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fnmsub_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 
_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) @@ -8200,11 +8200,11 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), \ - (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), \ + (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8217,36 +8217,36 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__mmask8)(U), (int)(R))) #define _mm512_permutex_pd(X, C) \ - (__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)) + ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))) #define _mm512_mask_permutex_pd(W, U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_permutex_pd(U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_permutex_epi64(X, C) \ - (__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)) + 
((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))) #define _mm512_mask_permutex_epi64(W, U, X, C) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_permutex_epi64(U, X, C) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)_mm512_setzero_si512())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) @@ -8416,10 +8416,10 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B) #define _kxor_mask16 _mm512_kxor #define _kshiftli_mask16(A, I) \ - (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)) + ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I))) #define _kshiftri_mask16(A, I) \ - (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)) + ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))) static __inline__ unsigned int __DEFAULT_FN_ATTRS _cvtmask16_u32(__mmask16 __A) { @@ -8538,48 +8538,48 @@ _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) } #define _mm_cmp_round_ss_mask(X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, (int)(R)) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), (int)(R)) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)(M), 
(int)(R))) #define _mm_cmp_ss_mask(X, Y, P) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_cmp_ss_mask(M, X, Y, P) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_cmp_round_sd_mask(X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, (int)(R)) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), (int)(R)) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) #define _mm_cmp_sd_mask(X, Y, P) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_cmp_sd_mask(M, X, Y, P) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) /* Bit Test */ @@ -8760,17 +8760,17 @@ _mm_maskz_load_sd (__mmask8 __U, const 
double* __A) } #define _mm512_shuffle_epi32(A, I) \ - (__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)) + ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))) #define _mm512_mask_shuffle_epi32(W, U, A, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_shuffle_epi32(U, A, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)_mm512_setzero_si512())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) @@ -8901,19 +8901,19 @@ _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) } #define _mm512_cvt_roundps_pd(A, R) \ - (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_pd(U, A, R) \ - (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtps_pd (__m256 __A) @@ -9010,22 +9010,22 @@ 
_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) } #define _mm_cvt_roundsd_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ - (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) @@ -9058,47 +9058,47 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #ifdef __x86_64__ #define _mm_cvt_roundi64_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R)) + ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ + (int)(R))) #define _mm_cvt_roundsi64_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R)) + ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ + (int)(R))) #endif #define _mm_cvt_roundsi32_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) #define 
_mm_cvt_roundi32_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) #ifdef __x86_64__ #define _mm_cvt_roundsi64_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ + (int)(R))) #define _mm_cvt_roundi64_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ + (int)(R))) #endif #define _mm_cvt_roundss_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) @@ -9127,8 +9127,8 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B) #ifdef __x86_64__ #define _mm_cvt_roundu64_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ - 
(unsigned long long)(B), (int)(R)) + ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ + (unsigned long long)(B), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtu64_sd (__m128d __A, unsigned long long __B) @@ -9139,8 +9139,8 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B) #endif #define _mm_cvt_roundu32_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ - (int)(R)) + ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtu32_ss (__m128 __A, unsigned __B) @@ -9151,8 +9151,8 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B) #ifdef __x86_64__ #define _mm_cvt_roundu64_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ - (unsigned long long)(B), (int)(R)) + ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ + (unsigned long long)(B), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtu64_ss (__m128 __A, unsigned long long __B) diff --git a/clang/lib/Headers/avx512vbmi2intrin.h b/clang/lib/Headers/avx512vbmi2intrin.h index a23144616ce36..17fa77722c64f 100644 --- a/clang/lib/Headers/avx512vbmi2intrin.h +++ b/clang/lib/Headers/avx512vbmi2intrin.h @@ -129,88 +129,88 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) } #define _mm512_shldi_epi64(A, B, I) \ - (__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) #define _mm512_mask_shldi_epi64(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S))) #define _mm512_maskz_shldi_epi64(U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - 
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_shldi_epi32(A, B, I) \ - (__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) #define _mm512_mask_shldi_epi32(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S))) #define _mm512_maskz_shldi_epi32(U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_shldi_epi16(A, B, I) \ - (__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I))) #define _mm512_mask_shldi_epi16(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S))) #define _mm512_maskz_shldi_epi16(U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512())) #define _mm512_shrdi_epi64(A, B, I) \ - 
(__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) #define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S))) #define _mm512_maskz_shrdi_epi64(U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_shrdi_epi32(A, B, I) \ - (__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) #define _mm512_mask_shrdi_epi32(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S))) #define _mm512_maskz_shrdi_epi32(U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_shrdi_epi16(A, B, I) \ - (__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I))) #define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ - 
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S))) #define _mm512_maskz_shrdi_epi16(U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 6ed10ed9803ba..7873516053ece 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -21,84 +21,84 @@ /* Integer compare */ #define _mm_cmp_epi8_mask(a, b, p) \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm_mask_cmp_epi8_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm_cmp_epu8_mask(a, b, p) \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm_mask_cmp_epu8_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)) + 
((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm256_cmp_epi8_mask(a, b, p) \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm256_mask_cmp_epi8_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm256_cmp_epu8_mask(a, b, p) \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm256_mask_cmp_epu8_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm_cmp_epi16_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epi16_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epu16_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + 
((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epu16_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epi16_mask(a, b, p) \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm256_mask_cmp_epi16_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm256_cmp_epu16_mask(a, b, p) \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm256_mask_cmp_epu16_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm_cmpeq_epi8_mask(A, B) \ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) @@ -1821,46 +1821,46 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) #define _mm_mask_shufflehi_epi16(W, U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ + 
(__v8hi)(__m128i)(W))) #define _mm_maskz_shufflehi_epi16(U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_si128())) #define _mm256_mask_shufflehi_epi16(W, U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)(__m256i)(W))) #define _mm256_maskz_shufflehi_epi16(U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)_mm256_setzero_si256())) #define _mm_mask_shufflelo_epi16(W, U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W))) #define _mm_maskz_shufflelo_epi16(U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_si128())) #define _mm256_mask_shufflelo_epi16(W, U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflelo_epi16((A), \ - (imm)), \ - (__v16hi)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflelo_epi16((A), \ + (imm)), \ + (__v16hi)(__m256i)(W))) #define _mm256_maskz_shufflelo_epi16(U, A, imm) \ - 
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflelo_epi16((A), \ - (imm)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflelo_epi16((A), \ + (imm)), \ + (__v16hi)_mm256_setzero_si256())) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi16(__m256i __A, __m256i __B) @@ -2756,52 +2756,52 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, } #define _mm_mask_alignr_epi8(W, U, A, B, N) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)(__m128i)(W)) + (__v16qi)(__m128i)(W))) #define _mm_maskz_alignr_epi8(U, A, B, N) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)_mm_setzero_si128()) + (__v16qi)_mm_setzero_si128())) #define _mm256_mask_alignr_epi8(W, U, A, B, N) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)(__m256i)(W)) + (__v32qi)(__m256i)(W))) #define _mm256_maskz_alignr_epi8(U, A, B, N) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)_mm256_setzero_si256()) + (__v32qi)_mm256_setzero_si256())) #define _mm_dbsad_epu8(A, B, imm) \ - (__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(imm)) + ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(imm))) #define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - 
(__v8hi)(__m128i)(W)) + (__v8hi)(__m128i)(W))) #define _mm_maskz_dbsad_epu8(U, A, B, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)_mm_setzero_si128()) + (__v8hi)_mm_setzero_si128())) #define _mm256_dbsad_epu8(A, B, imm) \ - (__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), (int)(imm))) #define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)(__m256i)(W)) + (__v16hi)(__m256i)(W))) #define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)_mm256_setzero_si256()) + (__v16hi)_mm256_setzero_si256())) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 95ba574ea8210..713e1a18a1b3f 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -773,134 +773,134 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { } #define _mm_range_pd(A, B, C) \ - (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_range_pd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + 
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_range_pd(U, A, B, C) \ - (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_range_pd(A, B, C) \ - (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_range_pd(W, U, A, B, C) \ - (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_range_pd(U, A, B, C) \ - (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_range_ps(A, B, C) \ - (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_range_ps(W, U, A, B, C) \ - (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - 
(__v4sf)(__m128)(W), (__mmask8)(U)) + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) #define _mm_maskz_range_ps(U, A, B, C) \ - (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_range_ps(A, B, C) \ - (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_range_ps(W, U, A, B, C) \ - (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)(__m256)(W), (__mmask8)(U)) + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) #define _mm256_maskz_range_ps(U, A, B, C) \ - (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) #define _mm_reduce_pd(A, B) \ - (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_reduce_pd(W, U, A, B) \ - (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + 
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_reduce_pd(U, A, B) \ - (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_reduce_pd(A, B) \ - (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_reduce_pd(W, U, A, B) \ - (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_reduce_pd(U, A, B) \ - (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_reduce_ps(A, B) \ - (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_reduce_ps(W, U, A, B) \ - (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm_maskz_reduce_ps(U, A, B) \ - (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + 
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_reduce_ps(A, B) \ - (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_reduce_ps(W, U, A, B) \ - (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm256_maskz_reduce_ps(U, A, B) \ - (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_movepi32_mask (__m128i __A) @@ -1066,100 +1066,100 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) } #define _mm256_extractf64x2_pd(A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1)) #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extractf64x2_pd(U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + 
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_extracti64x2_epi64(A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ (int)(imm), \ (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1) + (__mmask8)-1)) #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extracti64x2_epi64(U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm256_insertf64x2(A, B, imm) \ - (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ - (__v2df)(__m128d)(B), (int)(imm)) + ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ + (__v2df)(__m128d)(B), (int)(imm))) #define _mm256_mask_insertf64x2(W, U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W)) + (__v4df)(__m256d)(W))) #define _mm256_maskz_insertf64x2(U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd()) + (__v4df)_mm256_setzero_pd())) #define _mm256_inserti64x2(A, B, imm) \ - (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ - (__v2di)(__m128i)(B), (int)(imm)) + 
((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) #define _mm256_mask_inserti64x2(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) #define _mm256_maskz_inserti64x2(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_mask_fpclass_pd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm_fpclass_pd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fpclass_pd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm256_fpclass_pd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fpclass_ps_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm_fpclass_ps_mask(A, imm) \ - 
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fpclass_ps_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm256_fpclass_ps_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__mmask8)-1)) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 968c10efeac0c..0519dba59081a 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -771,124 +771,124 @@ _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) } #define _mm_cmp_epi32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epi32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epu32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epu32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ 
- (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epi32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epi32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epu32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epu32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epi64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epi64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epu64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + 
((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epu64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epi64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epi64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epu64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epu64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_ps_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_ps_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)(m)) + 
((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_pd_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_pd_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_ps_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_ps_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_pd_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_pd_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (int)(p), \ + (__mmask8)(m))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) @@ -3289,78 +3289,78 @@ _mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { } #define 
_mm_roundscale_pd(A, imm) \ - (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_roundscale_pd(W, U, A, imm) \ - (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_roundscale_pd(U, A, imm) \ - (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_roundscale_pd(A, imm) \ - (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_roundscale_pd(W, U, A, imm) \ - (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_roundscale_pd(U, A, imm) \ - (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_roundscale_ps(A, imm) \ - (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - 
(__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_roundscale_ps(W, U, A, imm) \ - (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm_maskz_roundscale_ps(U, A, imm) \ - (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_roundscale_ps(A, imm) \ - (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_roundscale_ps(W, U, A, imm) \ - (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm256_maskz_roundscale_ps(U, A, imm) \ - (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_scalef_pd (__m128d __A, __m128d __B) { @@ -4298,56 +4298,56 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { #define _mm_rol_epi32(a, b) \ - (__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)) + 
((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b))) #define _mm_mask_rol_epi32(w, u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)(__m128i)(w))) #define _mm_maskz_rol_epi32(u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_rol_epi32(a, b) \ - (__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))) #define _mm256_mask_rol_epi32(w, u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)(__m256i)(w))) #define _mm256_maskz_rol_epi32(u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_rol_epi64(a, b) \ - (__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))) #define _mm_mask_rol_epi64(w, u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)(__m128i)(w))) #define _mm_maskz_rol_epi64(u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128()) + 
((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_rol_epi64(a, b) \ - (__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))) #define _mm256_mask_rol_epi64(w, u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)(__m256i)(w))) #define _mm256_maskz_rol_epi64(u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256())) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi32 (__m128i __A, __m128i __B) @@ -4438,56 +4438,56 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) } #define _mm_ror_epi32(a, b) \ - (__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b))) #define _mm_mask_ror_epi32(w, u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)(__m128i)(w))) #define _mm_maskz_ror_epi32(u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_ror_epi32(a, b) \ - (__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b))) #define _mm256_mask_ror_epi32(w, u, a, b) \ 
- (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)(__m256i)(w))) #define _mm256_maskz_ror_epi32(u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_ror_epi64(a, b) \ - (__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b))) #define _mm_mask_ror_epi64(w, u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)(__m128i)(w))) #define _mm_maskz_ror_epi64(u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_ror_epi64(a, b) \ - (__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b))) #define _mm256_mask_ror_epi64(w, u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)(__m256i)(w))) #define _mm256_maskz_ror_epi64(u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + 
(__v4di)_mm256_setzero_si256())) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) @@ -5356,76 +5356,76 @@ _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) } #define _mm_fixupimm_pd(A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fixupimm_pd(A, U, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), \ - (int)(imm), (__mmask8)(U)) + ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), \ + (int)(imm), (__mmask8)(U))) #define _mm256_fixupimm_pd(A, B, C, imm) \ - (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \ - (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) #define 
_mm256_maskz_fixupimm_pd(U, A, B, C, imm) \ - (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), \ - (int)(imm), (__mmask8)(U)) + ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), \ + (int)(imm), (__mmask8)(U))) #define _mm_fixupimm_ps(A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fixupimm_ps(A, U, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ + ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_fixupimm_ps(A, B, C, imm) \ - (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \ - (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ - 
(__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ + ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) @@ -6033,44 +6033,44 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) } #define _mm_mask_permute_pd(W, U, X, C) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)(__m128d)(W)) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_permute_pd((X), (C)), \ + (__v2df)(__m128d)(W))) #define _mm_maskz_permute_pd(U, X, C) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)_mm_setzero_pd()) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_permute_pd((X), (C)), \ + (__v2df)_mm_setzero_pd())) #define _mm256_mask_permute_pd(W, U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)(__m256d)(W)) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permute_pd((X), (C)), \ + (__v4df)(__m256d)(W))) #define _mm256_maskz_permute_pd(U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permute_pd((X), (C)), \ + (__v4df)_mm256_setzero_pd())) #define _mm_mask_permute_ps(W, U, X, C) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)(__m128)(W)) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + 
(__v4sf)_mm_permute_ps((X), (C)), \ + (__v4sf)(__m128)(W))) #define _mm_maskz_permute_ps(U, X, C) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)_mm_setzero_ps()) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_permute_ps((X), (C)), \ + (__v4sf)_mm_setzero_ps())) #define _mm256_mask_permute_ps(W, U, X, C) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)(__m256)(W)) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_permute_ps((X), (C)), \ + (__v8sf)(__m256)(W))) #define _mm256_maskz_permute_ps(U, X, C) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)_mm256_setzero_ps()) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_permute_ps((X), (C)), \ + (__v8sf)_mm256_setzero_ps())) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) @@ -6526,175 +6526,175 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) } #define _mm_ternarylogic_epi32(A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ + ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + 
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_ternarylogic_epi32(A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ + ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm_ternarylogic_epi64(A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ + 
((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_ternarylogic_epi64(A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ + ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_shuffle_f32x4(A, B, imm) \ - (__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm)) + ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(imm))) #define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)(__m256)(W)) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)(__m256)(W))) #define 
_mm256_maskz_shuffle_f32x4(U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps()) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps())) #define _mm256_shuffle_f64x2(A, B, imm) \ - (__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(imm)) + ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(imm))) #define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W)) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)(__m256d)(W))) #define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd())) #define _mm256_shuffle_i32x4(A, B, imm) \ - (__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) #define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)(__m256i)(W))) #define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()) + 
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256())) #define _mm256_shuffle_i64x2(A, B, imm) \ - (__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) #define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) #define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_mask_shuffle_pd(W, U, A, B, M) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)(__m128d)(W)) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_shuffle_pd((A), (B), (M)), \ + (__v2df)(__m128d)(W))) #define _mm_maskz_shuffle_pd(U, A, B, M) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)_mm_setzero_pd()) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_shuffle_pd((A), (B), (M)), \ + (__v2df)_mm_setzero_pd())) #define _mm256_mask_shuffle_pd(W, U, A, B, M) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)(__m256d)(W)) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ + (__v4df)(__m256d)(W))) #define _mm256_maskz_shuffle_pd(U, A, B, M) \ - 
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ + (__v4df)_mm256_setzero_pd())) #define _mm_mask_shuffle_ps(W, U, A, B, M) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)(__m128)(W)) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ + (__v4sf)(__m128)(W))) #define _mm_maskz_shuffle_ps(U, A, B, M) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)_mm_setzero_ps()) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ + (__v4sf)_mm_setzero_ps())) #define _mm256_mask_shuffle_ps(W, U, A, B, M) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)(__m256)(W)) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ + (__v8sf)(__m256)(W))) #define _mm256_maskz_shuffle_ps(U, A, B, M) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)_mm256_setzero_ps()) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ + (__v8sf)_mm256_setzero_ps())) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_rsqrt14_pd (__m128d __A) @@ -7834,262 +7834,262 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) } #define _mm256_extractf32x4_ps(A, imm) \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1)) #define _mm256_mask_extractf32x4_ps(W, U, A, 
imm) \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extractf32x4_ps(U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_extracti32x4_epi32(A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1)) #define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extracti32x4_epi32(U, A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm256_insertf32x4(A, B, imm) \ - (__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ - (__v4sf)(__m128)(B), (int)(imm)) + ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ + (__v4sf)(__m128)(B), (int)(imm))) #define _mm256_mask_insertf32x4(W, U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_insertf32x4((A), (B), 
(imm)), \ - (__v8sf)(__m256)(W)) + (__v8sf)(__m256)(W))) #define _mm256_maskz_insertf32x4(U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps()) + (__v8sf)_mm256_setzero_ps())) #define _mm256_inserti32x4(A, B, imm) \ - (__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ - (__v4si)(__m128i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) #define _mm256_mask_inserti32x4(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)(__m256i)(W)) + (__v8si)(__m256i)(W))) #define _mm256_maskz_inserti32x4(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()) + (__v8si)_mm256_setzero_si256())) #define _mm_getmant_pd(A, B, C) \ - (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_getmant_pd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_getmant_pd(U, A, B, C) \ - (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), 
\ + (int)(((C)<<2) | (B)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_getmant_pd(A, B, C) \ - (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_getmant_pd(W, U, A, B, C) \ - (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_getmant_pd(U, A, B, C) \ - (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_getmant_ps(A, B, C) \ - (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_getmant_ps(W, U, A, B, C) \ - (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm_maskz_getmant_ps(U, A, B, C) \ - (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + 
(__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_getmant_ps(A, B, C) \ - (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_getmant_ps(W, U, A, B, C) \ - (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm256_maskz_getmant_ps(U, A, B, C) \ - (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) #define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ - (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ - (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - 
(__mmask8)(mask), (int)(scale)) + ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ - (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) - -#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ + ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ (void const *)(addr), \ (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ - (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) +#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ +#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ + ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ (void const *)(addr), \ (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), 
(int)(scale))) + +#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ - (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ - (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ - (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), 
\ - (__mmask8)(mask), (int)(scale)) - -#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ + ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ (void const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ - (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) +#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ +#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ + ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ (void const *)(addr), \ (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_permutex_pd(X, C) \ - (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)) + ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C))) #define _mm256_mask_permutex_pd(W, U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)(__m256d)(W)) + (__v4df)(__m256d)(W))) #define _mm256_maskz_permutex_pd(U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - 
(__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permutex_pd((X), (C)), \ + (__v4df)_mm256_setzero_pd())) #define _mm256_permutex_epi64(X, C) \ - (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C)) + ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C))) #define _mm256_mask_permutex_epi64(W, U, X, C) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)(__m256i)(W)) + (__v4di)(__m256i)(W))) #define _mm256_maskz_permutex_epi64(U, X, C) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)_mm256_setzero_si256()) + (__v4di)_mm256_setzero_si256())) static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_permutexvar_pd (__m256i __X, __m256d __Y) @@ -8175,60 +8175,60 @@ _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) } #define _mm_alignr_epi32(A, B, imm) \ - (__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(imm)) + ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) #define _mm_mask_alignr_epi32(W, U, A, B, imm) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)(__m128i)(W)) + (__v4si)(__m128i)(W))) #define _mm_maskz_alignr_epi32(U, A, B, imm) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)_mm_setzero_si128()) + (__v4si)_mm_setzero_si128())) #define _mm256_alignr_epi32(A, B, imm) \ - (__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm)) + 
((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) #define _mm256_mask_alignr_epi32(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)(__m256i)(W)) + (__v8si)(__m256i)(W))) #define _mm256_maskz_alignr_epi32(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()) + (__v8si)_mm256_setzero_si256())) #define _mm_alignr_epi64(A, B, imm) \ - (__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(imm)) + ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) #define _mm_mask_alignr_epi64(W, U, A, B, imm) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)(__m128i)(W)) + (__v2di)(__m128i)(W))) #define _mm_maskz_alignr_epi64(U, A, B, imm) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)_mm_setzero_si128()) + (__v2di)_mm_setzero_si128())) #define _mm256_alignr_epi64(A, B, imm) \ - (__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) #define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)(__m256i)(W)) + (__v4di)(__m256i)(W))) #define _mm256_maskz_alignr_epi64(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + 
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()) + (__v4di)_mm256_setzero_si256())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) @@ -8295,24 +8295,24 @@ _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) } #define _mm256_mask_shuffle_epi32(W, U, A, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)(__m256i)(W))) #define _mm256_maskz_shuffle_epi32(U, A, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_mask_shuffle_epi32(W, U, A, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)(__m128i)(W)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)(__m128i)(W))) #define _mm_maskz_shuffle_epi32(U, A, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)_mm_setzero_si128())) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) @@ -8413,27 +8413,27 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) } #define _mm_mask_cvt_roundps_ph(W, U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ + (__v8hi)(__m128i)(W), \ + 
(__mmask8)(U))) #define _mm_maskz_cvt_roundps_ph(U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph #define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph #define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) #define _mm256_maskz_cvt_roundps_ph(U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph #define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph diff --git a/clang/lib/Headers/avx512vlvbmi2intrin.h b/clang/lib/Headers/avx512vlvbmi2intrin.h index a40f926de75ab..fac1f232415af 100644 --- a/clang/lib/Headers/avx512vlvbmi2intrin.h +++ b/clang/lib/Headers/avx512vlvbmi2intrin.h @@ -239,172 +239,172 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) } #define _mm256_shldi_epi64(A, B, I) \ - (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I))) #define _mm256_mask_shldi_epi64(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S))) #define 
_mm256_maskz_shldi_epi64(U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_shldi_epi64(A, B, I) \ - (__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I))) #define _mm_mask_shldi_epi64(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S))) #define _mm_maskz_shldi_epi64(U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_shldi_epi32(A, B, I) \ - (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I))) #define _mm256_mask_shldi_epi32(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S))) #define _mm256_maskz_shldi_epi32(U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256())) 
#define _mm_shldi_epi32(A, B, I) \ - (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) #define _mm_mask_shldi_epi32(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S))) #define _mm_maskz_shldi_epi32(U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_shldi_epi16(A, B, I) \ - (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I))) #define _mm256_mask_shldi_epi16(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S))) #define _mm256_maskz_shldi_epi16(U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256())) #define _mm_shldi_epi16(A, B, I) \ - (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (int)(I))) #define _mm_mask_shldi_epi16(S, U, A, B, I) \ - 
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S))) #define _mm_maskz_shldi_epi16(U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128())) #define _mm256_shrdi_epi64(A, B, I) \ - (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I))) #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S))) #define _mm256_maskz_shrdi_epi64(U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_shrdi_epi64(A, B, I) \ - (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I))) #define _mm_mask_shrdi_epi64(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S))) #define _mm_maskz_shrdi_epi64(U, A, B, I) \ - 
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_shrdi_epi32(A, B, I) \ - (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I))) #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S))) #define _mm256_maskz_shrdi_epi32(U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_shrdi_epi32(A, B, I) \ - (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) #define _mm_mask_shrdi_epi32(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S))) #define _mm_maskz_shrdi_epi32(U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_shrdi_epi16(A, B, I) \ - 
(__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I))) #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S))) #define _mm256_maskz_shrdi_epi16(U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256())) #define _mm_shrdi_epi16(A, B, I) \ - (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (int)(I))) #define _mm_mask_shrdi_epi16(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S))) #define _mm_maskz_shrdi_epi16(U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128())) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index 71ac1b4370d4f..0fb29af262f71 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -36,7 
+36,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpbusd_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -56,7 +56,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpbusds_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -74,7 +74,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpwssd_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -92,7 +92,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpwssds_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -112,7 +112,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpbusd_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// 
corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -132,7 +132,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpbusds_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -150,7 +150,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpwssd_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -168,7 +168,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpwssds_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h index 109b604adae3b..13905e6fb0ec8 100644 --- a/clang/lib/Headers/f16cintrin.h +++ b/clang/lib/Headers/f16cintrin.h @@ -66,8 +66,8 @@ _cvtsh_ss(unsigned short __a) /// 1XX: Use MXCSR.RC for rounding /// \returns The converted 16-bit half-precision float value. #define _cvtss_sh(a, imm) \ - (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ - (imm)))[0]) + ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ + (imm)))[0])) /// Converts a 128-bit vector containing 32-bit float values into a /// 128-bit vector containing 16-bit half-precision float values. 
@@ -93,7 +93,7 @@ _cvtsh_ss(unsigned short __a) /// values. The lower 64 bits are used to store the converted 16-bit /// half-precision floating-point values. #define _mm_cvtps_ph(a, imm) \ - (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)) + ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))) /// Converts a 128-bit vector containing 16-bit half-precision float /// values into a 128-bit vector containing 32-bit float values. @@ -136,7 +136,7 @@ _mm_cvtph_ps(__m128i __a) /// \returns A 128-bit vector containing the converted 16-bit half-precision /// float values. #define _mm256_cvtps_ph(a, imm) \ - (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)) + ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))) /// Converts a 128-bit vector containing 16-bit half-precision float /// values into a 256-bit vector of [8 x float]. diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h index 11a321b7c919b..a59238b0b1319 100644 --- a/clang/lib/Headers/gfniintrin.h +++ b/clang/lib/Headers/gfniintrin.h @@ -28,14 +28,14 @@ #define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ - (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I)) + ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I))) #define _mm_gf2p8affine_epi64_epi8(A, B, I) \ - (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I)) + ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I))) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) @@ -46,14 +46,14 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) #ifdef __AVXINTRIN_H #define 
_mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ - (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I)) + ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I))) #define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ - (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I)) + ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I))) static __inline__ __m256i __DEFAULT_FN_ATTRS_Y _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) @@ -65,32 +65,32 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) #ifdef __AVX512BWINTRIN_H #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ - (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), \ - (char)(I)) + ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I))) #define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S))) #define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ - U, A, B, I) + _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ - (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), \ - (char)(I)) + ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I))) #define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ 
- (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \ + (__v64qi)(__m512i)(S))) #define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ - U, A, B, I) + _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) @@ -117,40 +117,39 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) #ifdef __AVX512VLBWINTRIN_H #define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S))) #define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) + _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) #define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S))) #define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) + _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) #define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - 
(__v16qi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S))) #define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) + _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I) #define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S))) #define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) + _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) diff --git a/clang/lib/Headers/vpclmulqdqintrin.h b/clang/lib/Headers/vpclmulqdqintrin.h index 44daadb07d57c..485692ea2b5b1 100644 --- a/clang/lib/Headers/vpclmulqdqintrin.h +++ b/clang/lib/Headers/vpclmulqdqintrin.h @@ -15,15 +15,15 @@ #define __VPCLMULQDQINTRIN_H #define _mm256_clmulepi64_epi128(A, B, I) \ - (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (char)(I)) + ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (char)(I))) #ifdef __AVX512FINTRIN_H #define _mm512_clmulepi64_epi128(A, B, I) \ - (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (char)(I)) + ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (char)(I))) #endif // __AVX512FINTRIN_H #endif /* __VPCLMULQDQINTRIN_H */ diff --git a/clang/lib/Headers/xopintrin.h 
b/clang/lib/Headers/xopintrin.h index 5cedde41b625f..976cdf4902a40 100644 --- a/clang/lib/Headers/xopintrin.h +++ b/clang/lib/Headers/xopintrin.h @@ -225,16 +225,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B) } #define _mm_roti_epi8(A, N) \ - (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))) #define _mm_roti_epi16(A, N) \ - (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))) #define _mm_roti_epi32(A, N) \ - (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))) #define _mm_roti_epi64(A, N) \ - (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_shl_epi8(__m128i __A, __m128i __B) @@ -285,36 +285,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B) } #define _mm_com_epu8(A, B, N) \ - (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (N))) #define _mm_com_epu16(A, B, N) \ - (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (N))) #define _mm_com_epu32(A, B, N) \ - (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (N))) #define _mm_com_epu64(A, B, N) \ - (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (N))) #define _mm_com_epi8(A, B, N) \ - (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ + 
(__v16qi)(__m128i)(B), (N))) #define _mm_com_epi16(A, B, N) \ - (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (N))) #define _mm_com_epi32(A, B, N) \ - (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (N))) #define _mm_com_epi64(A, B, N) \ - (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (N))) #define _MM_PCOMCTRL_LT 0 #define _MM_PCOMCTRL_LE 1 @@ -710,23 +710,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B) } #define _mm_permute2_pd(X, Y, C, I) \ - (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__v2di)(__m128i)(C), (I)) + ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(C), (I))) #define _mm256_permute2_pd(X, Y, C, I) \ - (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), \ - (__v4di)(__m256i)(C), (I)) + ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(C), (I))) #define _mm_permute2_ps(X, Y, C, I) \ - (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (__v4si)(__m128i)(C), (I)) + ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(C), (I))) #define _mm256_permute2_ps(X, Y, C, I) \ - (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ - (__v8sf)(__m256)(Y), \ - (__v8si)(__m256i)(C), (I)) + ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(C), (I))) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_frcz_ss(__m128 __A) From 6934ab5f4097fa775a0d91a2d7187bf5c9c527e6 Mon Sep 17 00:00:00 2001 
From: Vitaly Buka Date: Sat, 14 Aug 2021 18:33:03 -0700 Subject: [PATCH 051/700] [sanitizer] Improve VSNPrintf internal diagnostics --- .../sanitizer_common/sanitizer_internal_defs.h | 16 +++++++++------- .../lib/sanitizer_common/sanitizer_printf.cpp | 12 ++++++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index 07b303e06a098..de06f42c3f48c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -292,14 +292,16 @@ void NORETURN CheckFailed(const char *file, int line, const char *cond, u64 v1, u64 v2); // Check macro -#define RAW_CHECK_MSG(expr, msg) do { \ - if (UNLIKELY(!(expr))) { \ - RawWrite(msg); \ - Die(); \ - } \ -} while (0) +#define RAW_CHECK_MSG(expr, msg, ...) \ + do { \ + if (UNLIKELY(!(expr))) { \ + const char* msgs[] = {msg, __VA_ARGS__}; \ + for (const char* m : msgs) RawWrite(m); \ + Die(); \ + } \ + } while (0) -#define RAW_CHECK(expr) RAW_CHECK_MSG(expr, #expr) +#define RAW_CHECK(expr, ...) 
RAW_CHECK_MSG(expr, #expr "\n", __VA_ARGS__) #define CHECK_IMPL(c1, op, c2) \ do { \ diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp index 0938aa833753a..79aee8ba62823 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp @@ -129,7 +129,7 @@ int VSNPrintf(char *buff, int buff_length, const char *format, va_list args) { static const char *kPrintfFormatsHelp = "Supported Printf formats: %([0-9]*)?(z|l|ll)?{d,u,x,X}; %p; " - "%[-]([0-9]*)?(\\.\\*)?s; %c\n"; + "%[-]([0-9]*)?(\\.\\*)?s; %c\nProvided format: "; RAW_CHECK(format); RAW_CHECK(buff_length > 0); const char *buff_end = &buff[buff_length - 1]; @@ -191,12 +191,12 @@ int VSNPrintf(char *buff, int buff_length, break; } case 'p': { - RAW_CHECK_MSG(!have_flags, kPrintfFormatsHelp); + RAW_CHECK(!have_flags, kPrintfFormatsHelp, format); result += AppendPointer(&buff, buff_end, va_arg(args, uptr)); break; } case 's': { - RAW_CHECK_MSG(!have_length, kPrintfFormatsHelp); + RAW_CHECK(!have_length, kPrintfFormatsHelp, format); // Only left-justified width is supported. CHECK(!have_width || left_justified); result += AppendString(&buff, buff_end, left_justified ? 
-width : width, @@ -204,17 +204,17 @@ int VSNPrintf(char *buff, int buff_length, break; } case 'c': { - RAW_CHECK_MSG(!have_flags, kPrintfFormatsHelp); + RAW_CHECK(!have_flags, kPrintfFormatsHelp, format); result += AppendChar(&buff, buff_end, va_arg(args, int)); break; } case '%' : { - RAW_CHECK_MSG(!have_flags, kPrintfFormatsHelp); + RAW_CHECK(!have_flags, kPrintfFormatsHelp, format); result += AppendChar(&buff, buff_end, '%'); break; } default: { - RAW_CHECK_MSG(false, kPrintfFormatsHelp); + RAW_CHECK(false, kPrintfFormatsHelp, format); } } } From db0af393f967c550a9fade1472d1513b97002549 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 14 Aug 2021 18:42:05 -0700 Subject: [PATCH 052/700] [msan] Fix ppc64 format warning --- compiler-rt/lib/msan/msan_linux.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index c27e66661d222..bced00ba24282 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -106,7 +106,7 @@ static void CheckMemoryLayoutSanity() { bool InitShadow(bool init_origins) { // Let user know mapping parameters first. - VPrintf(1, "__msan_init %p\n", &__msan_init); + VPrintf(1, "__msan_init %p\n", reinterpret_cast(&__msan_init)); for (unsigned i = 0; i < kMemoryLayoutSize; ++i) VPrintf(1, "%s: %zx - %zx\n", kMemoryLayout[i].name, kMemoryLayout[i].start, kMemoryLayout[i].end - 1); @@ -115,7 +115,7 @@ bool InitShadow(bool init_origins) { if (!MEM_IS_APP(&__msan_init)) { Printf("FATAL: Code %p is out of application range. 
Non-PIE build?\n", - &__msan_init); + reinterpret_cast(&__msan_init)); return false; } From 530aa7e4da14fb22493ab7e175f8c34dd10333d3 Mon Sep 17 00:00:00 2001 From: Itay Bookstein Date: Sat, 14 Aug 2021 22:01:10 -0700 Subject: [PATCH 053/700] [Linker] Import GlobalIFunc when importing symbols from another module Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D107988 --- llvm/lib/Linker/LinkModules.cpp | 4 ++++ llvm/test/Linker/ifunc.ll | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 llvm/test/Linker/ifunc.ll diff --git a/llvm/lib/Linker/LinkModules.cpp b/llvm/lib/Linker/LinkModules.cpp index 97d6f8cd80755..971d3f0b21683 100644 --- a/llvm/lib/Linker/LinkModules.cpp +++ b/llvm/lib/Linker/LinkModules.cpp @@ -526,6 +526,10 @@ bool ModuleLinker::run() { if (linkIfNeeded(GA)) return true; + for (GlobalIFunc &GI : SrcM->ifuncs()) + if (linkIfNeeded(GI)) + return true; + for (unsigned I = 0; I < ValuesToLink.size(); ++I) { GlobalValue *GV = ValuesToLink[I]; const Comdat *SC = GV->getComdat(); diff --git a/llvm/test/Linker/ifunc.ll b/llvm/test/Linker/ifunc.ll new file mode 100644 index 0000000000000..1e5396ed5fed6 --- /dev/null +++ b/llvm/test/Linker/ifunc.ll @@ -0,0 +1,25 @@ +; RUN: split-file %s %t +; RUN: llvm-link %t/a.ll %t/b.ll -S -o - | FileCheck %s + +;; Check that ifuncs are linked in properly. 
+ +; CHECK-DAG: @foo = ifunc void (), bitcast (void ()* ()* @foo_resolve to void ()*) +; CHECK-DAG: define internal void ()* @foo_resolve() { + +; CHECK-DAG: @bar = ifunc void (), bitcast (void ()* ()* @bar_resolve to void ()*) +; CHECK-DAG: define internal void ()* @bar_resolve() { + +;--- a.ll +declare void @bar() + +;--- b.ll +@foo = ifunc void (), bitcast (void ()* ()* @foo_resolve to void ()*) +@bar = ifunc void (), bitcast (void ()* ()* @bar_resolve to void ()*) + +define internal void ()* @foo_resolve() { + ret void ()* null +} + +define internal void ()* @bar_resolve() { + ret void ()* null +} From 435785214f73ff0c92e97f2ade6356e3ba3bf661 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi Date: Sat, 14 Aug 2021 23:37:15 -0600 Subject: [PATCH 054/700] [Remarks] Emit optimization remarks for atomics generating CAS loop Implements ORE in AtomicExpand pass to report atomics generating a compare and swap loop. Differential Revision: https://reviews.llvm.org/D106891 --- .../CodeGenCUDA/atomics-remarks-gfx90a.cu | 16 +++ .../CodeGenOpenCL/atomics-remarks-gfx90a.cl | 46 ++++++++ llvm/lib/CodeGen/AtomicExpandPass.cpp | 23 +++- llvm/test/CodeGen/AArch64/O0-pipeline.ll | 7 +- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 7 +- .../CodeGen/AMDGPU/atomics-remarks-gfx90a.ll | 103 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 25 +++++ llvm/test/CodeGen/ARM/O3-pipeline.ll | 5 + llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 10 +- llvm/test/CodeGen/X86/O0-pipeline.ll | 7 +- llvm/test/CodeGen/X86/opt-pipeline.ll | 7 +- 11 files changed, 249 insertions(+), 7 deletions(-) create mode 100644 clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu create mode 100644 clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl create mode 100644 llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu new file mode 100644 index 0000000000000..96892286fd75e --- /dev/null +++ 
b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +#include "Inputs/cuda.h" +#include + +// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope +// GFX90A-CAS-LABEL: _Z14atomic_add_casPf +// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc +// GFX90A-CAS: s_cbranch_execnz +__device__ float atomic_add_cas(float *p) { + return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); +} diff --git a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl new file mode 100644 index 0000000000000..2d8b68f83b9d6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=REMARK + +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +typedef enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = 
__OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand] +// GFX90A-CAS-LABEL: @atomic_cas +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("wavefront-one-as") monotonic +float atomic_cas(__global atomic_float *d, float a) { + float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); + float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device); + float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices); + float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group); +} + + + diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 125a3be585cb5..5b5458e1058e8 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -58,6 
+59,7 @@ namespace { class AtomicExpand: public FunctionPass { const TargetLowering *TLI = nullptr; + OptimizationRemarkEmitter *ORE; public: static char ID; // Pass identification, replacement for typeid @@ -69,6 +71,7 @@ namespace { bool runOnFunction(Function &F) override; private: + void getAnalysisUsage(AnalysisUsage &AU) const override; bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -165,11 +168,16 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; } +void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); +} + bool AtomicExpand::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; + ORE = &getAnalysis().getORE(); auto &TM = TPC->getTM(); if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; @@ -570,7 +578,9 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + LLVMContext &Ctx = AI->getModule()->getContext(); + TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); + switch (Kind) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: { @@ -600,6 +610,17 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { expandPartwordAtomicRMW(AI, TargetLoweringBase::AtomicExpansionKind::CmpXChg); } else { + SmallVector SSNs; + Ctx.getSyncScopeNames(SSNs); + auto MemScope = SSNs[AI->getSyncScopeID()].empty() + ? 
"system" + : SSNs[AI->getSyncScopeID()]; + ORE->emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Passed", AI->getFunction()) + << "A compare and swap loop was generated for an atomic " + << AI->getOperationName(AI->getOperation()) << " operation at " + << MemScope << " memory scope"; + }); expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); } return true; diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index c382ad0f783cd..c7243890e5005 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -8,13 +8,18 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 8d03f7db45723..50ec6f68907ea 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -8,8 +8,8 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Profile summary info +; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Create 
Garbage Collector Module Metadata @@ -17,6 +17,11 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll new file mode 100644 index 0000000000000..240963cfe9009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS + +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at 
singlethread-one-as memory scope + +; GFX90A-CAS-LABEL: atomic_add_cas: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_agent: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_agent(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_workgroup: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_wavefront: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_singlethread: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] 
glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 73909dc918f0a..dba871eee99fd 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,6 +44,11 @@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Natural Loop Information +; GCN-O0-NEXT: Lazy Branch Probability Analysis +; GCN-O0-NEXT: Lazy Block Frequency Analysis +; GCN-O0-NEXT: Optimization Remark Emitter ; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks 
from the CFG @@ -180,6 +185,11 @@ ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Lazy Branch Probability Analysis +; GCN-O1-NEXT: Lazy Block Frequency Analysis +; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction @@ -431,6 +441,11 @@ ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis +; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis +; GCN-O1-OPTS-NEXT: Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -715,6 +730,11 @@ ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis +; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Expand Atomic instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction @@ -1001,6 +1021,11 @@ ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis +; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: 
Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 2a5ba7653da9f..e25f7b31bd2ef 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -5,6 +5,11 @@ ; CHECK: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index aee62db60390c..c37646c6ffb07 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -8,16 +8,21 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Convert i1 constants to i32/i64 if they are returned +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: PPC Lower MASS Entries ; CHECK-NEXT: FunctionPass Manager @@ -206,4 +211,5 @@ define void @f() 
{ ret void -} \ No newline at end of file +} + diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index bf3ae61660757..8f0275706996a 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -10,13 +10,18 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index c809433a2fff8..a480d901160fc 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -16,15 +16,20 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy 
Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store From c35e4dc8dd0b9d4e53633962670d70832fa383c8 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Sun, 15 Aug 2021 09:21:08 +0200 Subject: [PATCH 055/700] [asan][test] Un-xfail Posix/unpoison-alternate-stack.cpp on Solaris again `Posix/unpoison-alternate-stack.cpp` currently `XPASS`es on Solaris. The `XFAIL` had already been removed in D97933 , but reintroduced by commit f03d29601e0951da2c88f07d4234128e14e87870 which was never posted or justified. Given the `XPASS`, this obviously wasn't NFC, so I suggest to remove it again. Tested on `amd64-pc-solaris2.11` and `x86_64-pc-linux-gnu`. Differential Revision: https://reviews.llvm.org/D108030 --- .../test/asan/TestCases/Posix/unpoison-alternate-stack.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Posix/unpoison-alternate-stack.cpp b/compiler-rt/test/asan/TestCases/Posix/unpoison-alternate-stack.cpp index a2082ed082154..50d28ddf84c2f 100644 --- a/compiler-rt/test/asan/TestCases/Posix/unpoison-alternate-stack.cpp +++ b/compiler-rt/test/asan/TestCases/Posix/unpoison-alternate-stack.cpp @@ -6,9 +6,6 @@ // RUN: %clangxx_asan -std=c++20 -fexceptions -O0 %s -o %t -pthread // RUN: %run %t -// longjmp from signal handler is unportable. -// XFAIL: solaris - #include #include #include From 265a9961d13e78fc1a5f4b478ac9651fcccdf92b Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Fri, 13 Aug 2021 13:03:21 +0000 Subject: [PATCH 056/700] [flang][nfc] Move `Semantics` from `FrontendAction` to `CompilerInstance` `CompilerInstance` is a more appropriate place for a key component of the frontend like `Semantics`. This change opens a path for us to introduce new frontend actions that will also run semantics, but for which inheriting from `PrescanAndSemaAction` wouldn't make much sense. 
For example, for code-gen actions we plan to introduce a dedicate hierarchy of action classes. I've also added a doxyment for `CompilerInstance` to add a bit of context for this change (and also make future refactoring more informed). As `CompilerInstance` in Flang has been inspired by its counterpart in Clang, this comment is roughly a verbatim copy of the comment in Clang (with some adjustments from me). Credits to Daniel Dunbar for the great design and the original comment. Differential Revision: https://reviews.llvm.org/D108035 --- .../include/flang/Frontend/CompilerInstance.h | 24 +++++++++++ .../include/flang/Frontend/FrontendActions.h | 9 ---- flang/lib/Frontend/FrontendActions.cpp | 43 +++++++++++-------- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/flang/include/flang/Frontend/CompilerInstance.h b/flang/include/flang/Frontend/CompilerInstance.h index 956fe144ac9ef..557c626ad9385 100644 --- a/flang/include/flang/Frontend/CompilerInstance.h +++ b/flang/include/flang/Frontend/CompilerInstance.h @@ -18,6 +18,21 @@ namespace Fortran::frontend { +/// Helper class for managing a single instance of the Flang compiler. +/// +/// This class serves two purposes: +/// (1) It manages the various objects which are necessary to run the compiler +/// (2) It provides utility routines for constructing and manipulating the +/// common Flang objects. +/// +/// The compiler instance generally owns the instance of all the objects that it +/// manages. However, clients can still share objects by manually setting the +/// object and retaking ownership prior to destroying the CompilerInstance. +/// +/// The compiler instance is intended to simplify clients, but not to lock them +/// in to the compiler instance for everything. When possible, utility functions +/// come in two forms; a short form that reuses the CompilerInstance objects, +/// and a long form that takes explicit instances of any required objects. 
class CompilerInstance { /// The options used in this compiler instance. @@ -30,6 +45,8 @@ class CompilerInstance { std::shared_ptr parsing_; + std::unique_ptr semantics_; + /// The stream for diagnostics from Semantics llvm::raw_ostream *semaOutputStream_ = &llvm::errs(); @@ -110,6 +127,13 @@ class CompilerInstance { /// Get the current stream for verbose output. llvm::raw_ostream &semaOutputStream() { return *semaOutputStream_; } + Fortran::semantics::Semantics &semantics() { return *semantics_; } + const Fortran::semantics::Semantics &semantics() const { return *semantics_; } + + void setSemantics(std::unique_ptr semantics) { + semantics_ = std::move(semantics); + } + /// } /// @name High-Level Operations /// { diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h index d30ae1dbed0ff..9cfaabcf7677e 100644 --- a/flang/include/flang/Frontend/FrontendActions.h +++ b/flang/include/flang/Frontend/FrontendActions.h @@ -90,18 +90,9 @@ class DebugDumpParseTreeNoSemaAction : public PrescanAndParseAction { // PrescanAndSema Actions //===----------------------------------------------------------------------===// class PrescanAndSemaAction : public FrontendAction { - std::unique_ptr semantics_; void ExecuteAction() override = 0; bool BeginSourceFileAction(CompilerInstance &ci) override; - -public: - Fortran::semantics::Semantics &semantics() { return *semantics_; } - const Fortran::semantics::Semantics &semantics() const { return *semantics_; } - - void setSemantics(std::unique_ptr semantics) { - semantics_ = std::move(semantics); - } }; class DebugUnparseWithSymbolsAction : public PrescanAndSemaAction { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index c12cafc02dbf3..acd6b049dfe8d 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -139,10 +139,10 @@ bool PrescanAndSemaAction::BeginSourceFileAction(CompilerInstance &c1) { 
auto &parseTree{*ci.parsing().parseTree()}; // Prepare semantics - setSemantics(std::make_unique( + ci.setSemantics(std::make_unique( ci.invocation().semanticsContext(), parseTree, ci.invocation().debugModuleDir())); - auto &semantics = this->semantics(); + auto &semantics = ci.semantics(); // Run semantic checks semantics.Perform(); @@ -224,8 +224,10 @@ void DebugDumpProvenanceAction::ExecuteAction() { } void ParseSyntaxOnlyAction::ExecuteAction() { - reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), - GetCurrentFileOrBufferName()); + CompilerInstance &ci = this->instance(); + + reportFatalSemanticErrors( + ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName()); } void DebugUnparseNoSemaAction::ExecuteAction() { @@ -256,24 +258,25 @@ void DebugUnparseAction::ExecuteAction() { invoc.useAnalyzedObjectsForUnparse() ? &invoc.asFortran() : nullptr); // Report fatal semantic errors - reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), + reportFatalSemanticErrors(ci.semantics(), this->instance().diagnostics(), GetCurrentFileOrBufferName()); } void DebugUnparseWithSymbolsAction::ExecuteAction() { + CompilerInstance &ci = this->instance(); auto &parseTree{*instance().parsing().parseTree()}; Fortran::semantics::UnparseWithSymbols( llvm::outs(), parseTree, /*encoding=*/Fortran::parser::Encoding::UTF_8); // Report fatal semantic errors - reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), - GetCurrentFileOrBufferName()); + reportFatalSemanticErrors( + ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName()); } void DebugDumpSymbolsAction::ExecuteAction() { CompilerInstance &ci = this->instance(); - auto &semantics = this->semantics(); + auto &semantics = ci.semantics(); auto tables{Fortran::semantics::BuildRuntimeDerivedTypeTables( instance().invocation().semanticsContext())}; @@ -306,7 +309,7 @@ void DebugDumpAllAction::ExecuteAction() { Fortran::parser::DumpTree( llvm::outs(), parseTree, 
&ci.invocation().asFortran()); - auto &semantics = this->semantics(); + auto &semantics = ci.semantics(); auto tables{Fortran::semantics::BuildRuntimeDerivedTypeTables( instance().invocation().semanticsContext())}; // The runtime derived type information table builder may find and report @@ -339,6 +342,7 @@ void DebugDumpParseTreeNoSemaAction::ExecuteAction() { } void DebugDumpParseTreeAction::ExecuteAction() { + CompilerInstance &ci = this->instance(); auto &parseTree{instance().parsing().parseTree()}; // Dump parse tree @@ -346,8 +350,8 @@ void DebugDumpParseTreeAction::ExecuteAction() { llvm::outs(), parseTree, &this->instance().invocation().asFortran()); // Report fatal semantic errors - reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), - GetCurrentFileOrBufferName()); + reportFatalSemanticErrors( + ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName()); } void DebugMeasureParseTreeAction::ExecuteAction() { @@ -385,7 +389,7 @@ void DebugPreFIRTreeAction::ExecuteAction() { CompilerInstance &ci = this->instance(); // Report and exit if fatal semantic errors are present if (reportFatalSemanticErrors( - semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) { + ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) { return; } @@ -410,12 +414,13 @@ void DebugDumpParsingLogAction::ExecuteAction() { } void GetDefinitionAction::ExecuteAction() { + CompilerInstance &ci = this->instance(); + // Report and exit if fatal semantic errors are present - if (reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), - GetCurrentFileOrBufferName())) + if (reportFatalSemanticErrors( + ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) return; - CompilerInstance &ci = this->instance(); parser::AllCookedSources &cs = ci.allCookedSources(); unsigned diagID = ci.diagnostics().getCustomDiagID( clang::DiagnosticsEngine::Error, "Symbol not found"); @@ -457,12 +462,14 @@ void GetDefinitionAction::ExecuteAction() { 
} void GetSymbolsSourcesAction::ExecuteAction() { + CompilerInstance &ci = this->instance(); + // Report and exit if fatal semantic errors are present - if (reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), - GetCurrentFileOrBufferName())) + if (reportFatalSemanticErrors( + ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) return; - semantics().DumpSymbolsSources(llvm::outs()); + ci.semantics().DumpSymbolsSources(llvm::outs()); } void EmitObjAction::ExecuteAction() { From 49de6070a2b7a9bb88ff7c935fea5176b1d9255f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Sun, 15 Aug 2021 11:44:13 +0200 Subject: [PATCH 057/700] Revert "[Remarks] Emit optimization remarks for atomics generating CAS loop" This reverts commit 435785214f73ff0c92e97f2ade6356e3ba3bf661. Still same compile time issues for -O0 -g, eg. +1.3% for sqlite3. --- .../CodeGenCUDA/atomics-remarks-gfx90a.cu | 16 --- .../CodeGenOpenCL/atomics-remarks-gfx90a.cl | 46 -------- llvm/lib/CodeGen/AtomicExpandPass.cpp | 23 +--- llvm/test/CodeGen/AArch64/O0-pipeline.ll | 7 +- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 7 +- .../CodeGen/AMDGPU/atomics-remarks-gfx90a.ll | 103 ------------------ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 25 ----- llvm/test/CodeGen/ARM/O3-pipeline.ll | 5 - llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 10 +- llvm/test/CodeGen/X86/O0-pipeline.ll | 7 +- llvm/test/CodeGen/X86/opt-pipeline.ll | 7 +- 11 files changed, 7 insertions(+), 249 deletions(-) delete mode 100644 clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu delete mode 100644 clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl delete mode 100644 llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu deleted file mode 100644 index 96892286fd75e..0000000000000 --- a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu +++ /dev/null @@ -1,16 +0,0 @@ -// RUN: %clang_cc1 %s 
-triple=amdgcn-amd-amdhsa -fcuda-is-device \ -// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \ -// RUN: FileCheck %s --check-prefix=GFX90A-CAS - -// REQUIRES: amdgpu-registered-target - -#include "Inputs/cuda.h" -#include - -// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope -// GFX90A-CAS-LABEL: _Z14atomic_add_casPf -// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc -// GFX90A-CAS: s_cbranch_execnz -__device__ float atomic_add_cas(float *p) { - return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); -} diff --git a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl deleted file mode 100644 index 2d8b68f83b9d6..0000000000000 --- a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ -// RUN: -Rpass=atomic-expand -S -o - 2>&1 | \ -// RUN: FileCheck %s --check-prefix=REMARK - -// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ -// RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \ -// RUN: FileCheck %s --check-prefix=GFX90A-CAS - -// REQUIRES: amdgpu-registered-target - -typedef enum memory_order { - memory_order_relaxed = __ATOMIC_RELAXED, - memory_order_acquire = __ATOMIC_ACQUIRE, - memory_order_release = __ATOMIC_RELEASE, - memory_order_acq_rel = __ATOMIC_ACQ_REL, - memory_order_seq_cst = __ATOMIC_SEQ_CST -} memory_order; - -typedef enum memory_scope { - memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, - memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, -#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) - memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -#endif -} memory_scope; - -// REMARK: remark: A compare and swap 
loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand] -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand] -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand] -// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand] -// GFX90A-CAS-LABEL: @atomic_cas -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("one-as") monotonic -// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("wavefront-one-as") monotonic -float atomic_cas(__global atomic_float *d, float a) { - float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); - float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device); - float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices); - float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group); -} - - - diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 5b5458e1058e8..125a3be585cb5 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -59,7 +58,6 @@ namespace { class AtomicExpand: public FunctionPass { const TargetLowering *TLI = 
nullptr; - OptimizationRemarkEmitter *ORE; public: static char ID; // Pass identification, replacement for typeid @@ -71,7 +69,6 @@ namespace { bool runOnFunction(Function &F) override; private: - void getAnalysisUsage(AnalysisUsage &AU) const override; bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -168,16 +165,11 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; } -void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); -} - bool AtomicExpand::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; - ORE = &getAnalysis().getORE(); auto &TM = TPC->getTM(); if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; @@ -578,9 +570,7 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - LLVMContext &Ctx = AI->getModule()->getContext(); - TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); - switch (Kind) { + switch (TLI->shouldExpandAtomicRMWInIR(AI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: { @@ -610,17 +600,6 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { expandPartwordAtomicRMW(AI, TargetLoweringBase::AtomicExpansionKind::CmpXChg); } else { - SmallVector SSNs; - Ctx.getSyncScopeNames(SSNs); - auto MemScope = SSNs[AI->getSyncScopeID()].empty() - ? 
"system" - : SSNs[AI->getSyncScopeID()]; - ORE->emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "Passed", AI->getFunction()) - << "A compare and swap loop was generated for an atomic " - << AI->getOperationName(AI->getOperation()) << " operation at " - << MemScope << " memory scope"; - }); expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); } return true; diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index c7243890e5005..c382ad0f783cd 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -8,18 +8,13 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Module Verifier ; CHECK-NEXT: Lower Garbage Collection Instructions diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 50ec6f68907ea..8d03f7db45723 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -8,8 +8,8 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Create 
Garbage Collector Module Metadata @@ -17,11 +17,6 @@ ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: SVE intrinsics optimizations ; CHECK-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll deleted file mode 100644 index 240963cfe9009..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ -; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS - -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope -; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at 
singlethread-one-as memory scope - -; GFX90A-CAS-LABEL: atomic_add_cas: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_agent: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_agent(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_workgroup: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_wavefront: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_singlethread: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] 
glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4 - ret void -} - -; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as: -; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc -; GFX90A-CAS: s_cbranch_execnz -define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) { -entry: - %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index dba871eee99fd..73909dc918f0a 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,11 +44,6 @@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: Dominator Tree Construction -; GCN-O0-NEXT: Natural Loop Information -; GCN-O0-NEXT: Lazy Branch Probability Analysis -; GCN-O0-NEXT: Lazy Block Frequency Analysis -; GCN-O0-NEXT: Optimization Remark Emitter ; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks 
from the CFG @@ -185,11 +180,6 @@ ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: Lazy Branch Probability Analysis -; GCN-O1-NEXT: Lazy Block Frequency Analysis -; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction @@ -441,11 +431,6 @@ ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis -; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis -; GCN-O1-OPTS-NEXT: Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -730,11 +715,6 @@ ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: Lazy Branch Probability Analysis -; GCN-O2-NEXT: Lazy Block Frequency Analysis -; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Expand Atomic instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction @@ -1021,11 +1001,6 @@ ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: Lazy Branch Probability Analysis -; GCN-O3-NEXT: Lazy Block Frequency Analysis -; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: 
Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index e25f7b31bd2ef..2a5ba7653da9f 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -5,11 +5,6 @@ ; CHECK: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Simplify the CFG ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index c37646c6ffb07..aee62db60390c 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -8,21 +8,16 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Assumption Cache Tracker ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Convert i1 constants to i32/i64 if they are returned -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: PPC Lower MASS Entries ; CHECK-NEXT: FunctionPass Manager @@ -211,5 +206,4 @@ define void @f() 
{ ret void -} - +} \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 8f0275706996a..bf3ae61660757 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -10,18 +10,13 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index a480d901160fc..c809433a2fff8 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -16,20 +16,15 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy 
Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store From 81b106584f2baf33e09be2362c35c1bf2f6bfe94 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 14 Aug 2021 23:35:27 +0200 Subject: [PATCH 058/700] [AArch64] Fix comparison peephole opt with non-0/1 immediate (PR51476) This is a non-intrusive fix for https://bugs.llvm.org/show_bug.cgi?id=51476 intended for backport to the 13.x release branch. It expands on the current hack by distinguishing between CmpValue of 0, 1 and 2, where 0 and 1 have the obvious meaning and 2 means "anything else". The new optimization from D98564 should only be performed for CmpValue of 0 or 1. For main, I think we should switch the analyzeCompare() and optimizeCompare() APIs to use int64_t instead of int, which is in line with MachineOperand's notion of an immediate, and avoids this problem altogether. Differential Revision: https://reviews.llvm.org/D108076 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 34 +++++++++------- .../CodeGen/AArch64/csinc-cmp-removal.mir | 39 +++++++++++++++++++ llvm/test/CodeGen/AArch64/pr51476.ll | 35 +++++++++++++++++ 3 files changed, 93 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/pr51476.ll diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 3a0cbbb275b5a..0ec4b5753ee17 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1120,6 +1120,16 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, if (!MI.getOperand(1).isReg()) return false; + auto NormalizeCmpValue = [](int64_t Value) -> int { + // Comparison immediates may be 64-bit, but CmpValue is only an int. + // Normalize to 0/1/2 return value, where 2 indicates any value apart from + // 0 or 1. 
+ // TODO: Switch CmpValue to int64_t in the API to avoid this. + if (Value == 0 || Value == 1) + return Value; + return 2; + }; + switch (MI.getOpcode()) { default: break; @@ -1155,8 +1165,7 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = MI.getOperand(2).getImm() != 0; + CmpValue = NormalizeCmpValue(MI.getOperand(2).getImm()); return true; case AArch64::ANDSWri: case AArch64::ANDSXri: @@ -1165,14 +1174,9 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - // FIXME:The return val type of decodeLogicalImmediate is uint64_t, - // while the type of CmpValue is int. When converting uint64_t to int, - // the high 32 bits of uint64_t will be lost. - // In fact it causes a bug in spec2006-483.xalancbmk - // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = AArch64_AM::decodeLogicalImmediate( + CmpValue = NormalizeCmpValue(AArch64_AM::decodeLogicalImmediate( MI.getOperand(2).getImm(), - MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64)); return true; } @@ -1462,10 +1466,9 @@ bool AArch64InstrInfo::optimizeCompareInstr( if (CmpInstr.getOpcode() == AArch64::PTEST_PP) return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); - // Continue only if we have a "ri" where immediate is zero. - // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare - // function. - assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); + // Warning: CmpValue == 2 indicates *any* value apart from 0 or 1. 
+ assert((CmpValue == 0 || CmpValue == 1 || CmpValue == 2) && + "CmpValue must be 0, 1, or 2!"); if (SrcReg2 != 0) return false; @@ -1473,9 +1476,10 @@ bool AArch64InstrInfo::optimizeCompareInstr( if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - if (!CmpValue && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) + if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) return true; - return removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); + return (CmpValue == 0 || CmpValue == 1) && + removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); } /// Get opcode of S version of Instr. diff --git a/llvm/test/CodeGen/AArch64/csinc-cmp-removal.mir b/llvm/test/CodeGen/AArch64/csinc-cmp-removal.mir index 4222e84b113cc..2098218d23f33 100644 --- a/llvm/test/CodeGen/AArch64/csinc-cmp-removal.mir +++ b/llvm/test/CodeGen/AArch64/csinc-cmp-removal.mir @@ -307,3 +307,42 @@ body: | RET_ReallyLR ... +--- +name: subswr_wrong_cmp_value +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: subswr_wrong_cmp_value + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: $x1 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK: [[DEF:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK: [[SUBSXrr:%[0-9]+]]:gpr64 = SUBSXrr killed [[DEF]], [[COPY]], implicit-def $nzcv + ; CHECK: [[CSINCWr:%[0-9]+]]:gpr32common = CSINCWr $wzr, $wzr, 1, implicit $nzcv + ; CHECK: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri killed [[CSINCWr]], 3, 0, implicit-def $nzcv + ; CHECK: Bcc 1, %bb.2, implicit $nzcv + ; CHECK: B %bb.1 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: B %bb.2 + ; CHECK: bb.2: + ; CHECK: RET_ReallyLR + bb.0: + liveins: $x1 + successors: %bb.1(0x40000000), %bb.2(0x40000000) + %1:gpr64common = COPY $x1 + %2:gpr64 = IMPLICIT_DEF + %3:gpr64 = SUBSXrr killed %2:gpr64, %1:gpr64common, implicit-def $nzcv + %4:gpr32common = CSINCWr $wzr, $wzr, 1, implicit $nzcv + %5:gpr32 = SUBSWri killed %4:gpr32common, 3, 0, 
implicit-def $nzcv + Bcc 1, %bb.2, implicit $nzcv + B %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + B %bb.2 + + bb.2: + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/pr51476.ll b/llvm/test/CodeGen/AArch64/pr51476.ll new file mode 100644 index 0000000000000..6abd41a121546 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr51476.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(i8 %arg) nounwind { +; CHECK-LABEL: test: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: cmp w0, #3 +; CHECK-NEXT: strb w0, [sp, #12] +; CHECK-NEXT: b.eq .LBB0_2 +; CHECK-NEXT: // %bb.1: // %do_call +; CHECK-NEXT: bl unknown +; CHECK-NEXT: .LBB0_2: // %common.ret +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %tmp = alloca i8 + %cmp1 = icmp ne i8 %arg, 1 + %zext = zext i1 %cmp1 to i8 + store i8 %zext, i8* %tmp + %zext2 = load i8, i8* %tmp + %cmp2 = icmp eq i8 %zext2, 3 + br i1 %cmp2, label %exit, label %do_call + +do_call: + call void @unknown(i8 %zext2) + ret void + +exit: + ret void +} + +declare void @unknown(i8) From f7a831daa6742cd67d24d8190093239ae4ef8774 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 13 Aug 2021 12:47:51 +0100 Subject: [PATCH 059/700] [LoopVectorize] Don't emit remarks about lack of scalable vectors unless they're specifically requested. Previously we emitted a "does not support scalable vectors" remark for all targets whenever vectorisation is attempted. This pollutes the output for architectures that don't support scalable vectors and is likely confusing to the user. 
Instead this patch introduces a debug message that reports when scalable vectorisation is allowed by the target and only issues the previous remark when scalable vectorisation is specifically requested, for example: #pragma clang loop vectorize_width(2, scalable) Differential Revision: https://reviews.llvm.org/D108028 --- .../Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++++++++------- .../LoopVectorize/AArch64/scalable-vf-hint.ll | 32 +++++++------ .../LoopVectorize/scalable-vf-hint.ll | 4 +- 3 files changed, 50 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 742a41dc47c73..aac382af50c2b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5591,13 +5591,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() { ElementCount LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { - if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { - reportVectorizationInfo( - "Disabling scalable vectorization, because target does not " - "support scalable vectors.", - "ScalableVectorsUnsupported", ORE, TheLoop); + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) return ElementCount::getScalable(0); - } if (Hints->isScalableVectorizationDisabled()) { reportVectorizationInfo("Scalable vectorization is explicitly disabled", @@ -5605,6 +5600,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return ElementCount::getScalable(0); } + LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n"); + auto MaxScalableVF = ElementCount::getScalable( std::numeric_limits::max()); @@ -5707,17 +5704,32 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, return MaxSafeFixedVF; } - LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF - << " is unsafe. 
Ignoring scalable UserVF.\n"); - ORE->emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", - TheLoop->getStartLoc(), - TheLoop->getHeader()) - << "User-specified vectorization factor " - << ore::NV("UserVectorizationFactor", UserVF) - << " is unsafe. Ignoring the hint to let the compiler pick a " - "suitable VF."; - }); + if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is ignored because scalable vectors are not " + "available.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is ignored because the target does not support scalable " + "vectors. The compiler will pick a more suitable value."; + }); + } else { + LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF + << " is unsafe. Ignoring scalable UserVF.\n"); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor", + TheLoop->getStartLoc(), + TheLoop->getHeader()) + << "User-specified vectorization factor " + << ore::NV("UserVectorizationFactor", UserVF) + << " is unsafe. Ignoring the hint to let the compiler pick a " + "more suitable value."; + }); + } } LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll index 2e600d461e899..a04b3c759e9b0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -38,6 +38,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; fixed-width vectorization is used instead. 
; CHECK-DBG: LV: Checking a loop in "test1" +; CHECK-DBG: LV: Scalable vectorization is available ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. ; CHECK-DBG: remark: :0:0: Max legal vector width too small, scalable vectorization unfeasible. ; CHECK-DBG: LV: The max safe fixed VF is: 8. @@ -82,6 +83,7 @@ exit: ; } ; CHECK-DBG: LV: Checking a loop in "test2" +; CHECK-DBG: LV: Scalable vectorization is available ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. ; CHECK-DBG: LV: The max safe fixed VF is: 4. ; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF. @@ -131,6 +133,7 @@ exit: ; Max fixed VF=32, Max scalable VF=2, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test3" +; CHECK-DBG: LV: Scalable vectorization is available ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. ; CHECK-DBG: LV: Using user VF vscale x 2. ; CHECK-LABEL: @test3 @@ -179,9 +182,10 @@ exit: ; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test4" +; CHECK-DBG: LV: Scalable vectorization is available ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2. ; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. -; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. +; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a more suitable value. ; CHECK-DBG: Found feasible scalable VF = vscale x 2 ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test4 @@ -229,6 +233,7 @@ exit: ; Max fixed VF=128, Max scalable VF=8, safe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test5" +; CHECK-DBG: LV: Scalable vectorization is available ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. 
; CHECK-DBG: LV: Using user VF vscale x 4 ; CHECK-LABEL: @test5 @@ -276,13 +281,14 @@ exit: ; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. ; CHECK-DBG-LABEL: LV: Checking a loop in "test6" +; CHECK-DBG: LV: Scalable vectorization is available ; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8. ; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF. -; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF. +; CHECK-DBG: remark: :0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a more suitable value. ; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4 -; CHECK-DBG: Selecting VF: 4. +; CHECK-DBG: Selecting VF: vscale x 4. ; CHECK-LABEL: @test6 -; CHECK: <4 x i32> +; CHECK: define void @test6(i32* %a, i32* %b) { entry: br label %loop @@ -310,9 +316,8 @@ exit: !17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} ; CHECK-NO-SVE-REMARKS-LABEL: LV: Checking a loop in "test_no_sve" -; CHECK-NO-SVE-REMARKS: LV: Disabling scalable vectorization, because target does not support scalable vectors. -; CHECK-NO-SVE-REMARKS: remark: :0:0: Disabling scalable vectorization, because target does not support scalable vectors. -; CHECK-NO-SVE-REMARKS: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. +; CHECK-NO-SVE-REMARKS: LV: User VF=vscale x 4 is ignored because scalable vectors are not available. +; CHECK-NO-SVE-REMARKS: remark: :0:0: User-specified vectorization factor vscale x 4 is ignored because the target does not support scalable vectors. The compiler will pick a more suitable value. ; CHECK-NO-SVE-REMARKS: LV: Selecting VF: 4. ; CHECK-NO-SVE-LABEL: @test_no_sve ; CHECK-NO-SVE: <4 x i32> @@ -344,12 +349,13 @@ exit: ; Test the LV falls back to fixed-width vectorization if scalable vectors are ; supported but max vscale is undefined. 
; -; CHECK-NO-SVE-REMARKS-LABEL: LV: Checking a loop in "test_no_max_vscale" -; CHECK-NO-SVE-REMARKS: The max safe fixed VF is: 4. -; CHECK-NO-SVE-REMARKS: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. -; CHECK-NO-SVE-REMARKS: LV: Selecting VF: 4. -; CHECK-NO-SVE-LABEL: @test_no_max_vscale -; CHECK-NO-SVE: <4 x i32> +; CHECK-DBG-LABEL: LV: Checking a loop in "test_no_max_vscale" +; CHECK-DBG: LV: Scalable vectorization is available +; CHECK-DBG: The max safe fixed VF is: 4. +; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF. +; CHECK-DBG: LV: Selecting VF: 4. +; CHECK-LABEL: @test_no_max_vscale +; CHECK: <4 x i32> define void @test_no_max_vscale(i32* %a, i32* %b) { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll index 6f5a591fa6d0d..ad2571d7a34a8 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll @@ -3,8 +3,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors. -; CHECK: remark: :0:0: Disabling scalable vectorization, because target does not support scalable vectors. +; CHECK: LV: User VF=vscale x 4 is ignored because scalable vectors are not available. +; CHECK: remark: :0:0: User-specified vectorization factor vscale x 4 is ignored because the target does not support scalable vectors. The compiler will pick a more suitable value. ; CHECK: LV: The Widest register safe to use is: 32 bits. 
define void @test1(i32* %a, i32* %b) { entry: From 079ca8e312016bd4fbbd3b185087d4c6246c140b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Sun, 15 Aug 2021 13:34:24 +0200 Subject: [PATCH 060/700] [Clang] Put -Wbool-operation under -Wall To keep compatibility with GCC. --- clang/include/clang/Basic/DiagnosticGroups.td | 2 ++ clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/test/Sema/warn-bitwise-negation-bool.c | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 4a67dffb2f7d4..30dadd9731c15 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -64,6 +64,7 @@ def StringConversion : DiagGroup<"string-conversion">; def SignConversion : DiagGroup<"sign-conversion">; def PointerBoolConversion : DiagGroup<"pointer-bool-conversion">; def UndefinedBoolConversion : DiagGroup<"undefined-bool-conversion">; +def BoolOperation : DiagGroup<"bool-operation">; def BoolConversion : DiagGroup<"bool-conversion", [PointerBoolConversion, UndefinedBoolConversion]>; def IntConversion : DiagGroup<"int-conversion">; @@ -944,6 +945,7 @@ def Extra : DiagGroup<"extra", [ ]>; def Most : DiagGroup<"most", [ + BoolOperation, CharSubscript, Comment, DeleteNonVirtualDtor, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index cbcc0e1f11c48..9eaa696d99913 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7424,7 +7424,7 @@ def note_member_first_declared_here : Note< def warn_bitwise_negation_bool : Warning< "bitwise negation of a boolean expression%select{;| always evaluates to 'true';}0 " "did you mean logical negation?">, - InGroup>; + InGroup, DefaultIgnore; def err_decrement_bool : Error<"cannot decrement expression of type bool">; def 
warn_increment_bool : Warning< "incrementing expression of type bool is deprecated and " diff --git a/clang/test/Sema/warn-bitwise-negation-bool.c b/clang/test/Sema/warn-bitwise-negation-bool.c index c74705bc765a2..d9196ca592b26 100644 --- a/clang/test/Sema/warn-bitwise-negation-bool.c +++ b/clang/test/Sema/warn-bitwise-negation-bool.c @@ -1,9 +1,9 @@ // RUN: %clang_cc1 -x c -fsyntax-only -verify -Wbool-operation %s -// RUN: %clang_cc1 -x c -fsyntax-only -verify %s -// RUN: %clang_cc1 -x c -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -x c -fsyntax-only -verify -Wall %s +// RUN: %clang_cc1 -x c -fsyntax-only -Wbool-operation -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s // RUN: %clang_cc1 -x c++ -fsyntax-only -verify -Wbool-operation %s -// RUN: %clang_cc1 -x c++ -fsyntax-only -verify %s -// RUN: %clang_cc1 -x c++ -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -verify -Wall %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -Wbool-operation -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s #ifdef __cplusplus typedef bool boolean; From ccd7dda8e39adeaf5bbec3c7b68b68800dff7663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Sun, 15 Aug 2021 13:54:58 +0200 Subject: [PATCH 061/700] [Clang] Updated warning-wall.c test file -Wbool-operation was moved to -Wall and test file needs to be adjusted. 
--- clang/test/Misc/warning-wall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Misc/warning-wall.c b/clang/test/Misc/warning-wall.c index ee3a63e7bbd48..a3686fb96a4ce 100644 --- a/clang/test/Misc/warning-wall.c +++ b/clang/test/Misc/warning-wall.c @@ -3,6 +3,7 @@ RUN: FileCheck --input-file=%t %s CHECK:-Wall CHECK-NEXT: -Wmost +CHECK-NEXT: -Wbool-operation CHECK-NEXT: -Wchar-subscripts CHECK-NEXT: -Wcomment CHECK-NEXT: -Wdelete-non-virtual-dtor From 957334382cd12ec07b46c0ddfdcc220731f6d80f Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Sun, 15 Aug 2021 13:35:53 +0100 Subject: [PATCH 062/700] [ExecutionEngine] Check for libunwind before calling __register_frame libgcc and libunwind have different flavours of __register_frame. Both flavours are already correctly handled, except that the code to handle the libunwind flavour is guarded by __APPLE__. This change uses the presence of __unw_add_dynamic_fde in libunwind instead to detect whether libunwind is used, rather than hardcoding it as Apple vs. non-Apple. Fixes PR44074. Thanks to Albert Jin and Chris Schafmeister for identifying the problem. Reviewed By: lhames Differential Revision: https://reviews.llvm.org/D106129 --- llvm/cmake/config-ix.cmake | 1 + llvm/cmake/unwind.h | 1 + llvm/include/llvm/Config/config.h.cmake | 3 ++ .../Orc/TargetProcess/RegisterEHFrames.cpp | 28 +++++++++++-------- .../RuntimeDyld/RTDyldMemoryManager.cpp | 4 ++- 5 files changed, 24 insertions(+), 13 deletions(-) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index dd0aaadb47c72..6da2012728ad5 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -211,6 +211,7 @@ endif() # Determine whether we can register EH tables. 
check_symbol_exists(__register_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_REGISTER_FRAME) check_symbol_exists(__deregister_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_DEREGISTER_FRAME) +check_symbol_exists(__unw_add_dynamic_fde "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_UNW_ADD_DYNAMIC_FDE) check_symbol_exists(_Unwind_Backtrace "unwind.h" HAVE__UNWIND_BACKTRACE) check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE) diff --git a/llvm/cmake/unwind.h b/llvm/cmake/unwind.h index e7f53465f9ce3..52243f2af787b 100644 --- a/llvm/cmake/unwind.h +++ b/llvm/cmake/unwind.h @@ -5,3 +5,4 @@ // exist in the runtime. extern void __register_frame(const void *fde); // NOLINT extern void __deregister_frame(const void *fde); // NOLINT +extern void __unw_add_dynamic_fde(); // NOLINT diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 8d58ec9d665b3..d7cd44b5db36a 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -64,6 +64,9 @@ /* Define to 1 if we can deregister EH frames on this platform. */ #cmakedefine HAVE_DEREGISTER_FRAME ${HAVE_DEREGISTER_FRAME} +/* Define if __unw_add_dynamic_fde() is available on this platform. */ +#cmakedefine HAVE_UNW_ADD_DYNAMIC_FDE ${HAVE_UNW_ADD_DYNAMIC_FDE} + /* Define to 1 if you have the header file. 
*/ #cmakedefine HAVE_ERRNO_H ${HAVE_ERRNO_H} diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp index db002a34b4445..6f891afabfdc3 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp @@ -85,11 +85,11 @@ static Error deregisterFrameWrapper(const void *P) { } #endif -#ifdef __APPLE__ +#ifdef HAVE_UNW_ADD_DYNAMIC_FDE template -Error walkAppleEHFrameSection(const char *const SectionStart, - size_t SectionSize, HandleFDEFn HandleFDE) { +Error walkLibunwindEHFrameSection(const char *const SectionStart, + size_t SectionSize, HandleFDEFn HandleFDE) { const char *CurCFIRecord = SectionStart; const char *End = SectionStart + SectionSize; uint64_t Size = *reinterpret_cast(CurCFIRecord); @@ -123,16 +123,19 @@ Error walkAppleEHFrameSection(const char *const SectionStart, return Error::success(); } -#endif // __APPLE__ +#endif // HAVE_UNW_ADD_DYNAMIC_FDE Error registerEHFrameSection(const void *EHFrameSectionAddr, size_t EHFrameSectionSize) { -#ifdef __APPLE__ - // On Darwin __register_frame has to be called for each FDE entry. - return walkAppleEHFrameSection(static_cast(EHFrameSectionAddr), - EHFrameSectionSize, registerFrameWrapper); + /* libgcc and libunwind __register_frame behave differently. We use the + * presence of __unw_add_dynamic_fde to detect libunwind. */ +#ifdef HAVE_UNW_ADD_DYNAMIC_FDE + // With libunwind, __register_frame has to be called for each FDE entry. + return walkLibunwindEHFrameSection( + static_cast(EHFrameSectionAddr), EHFrameSectionSize, + registerFrameWrapper); #else - // On Linux __register_frame takes a single argument: + // With libgcc, __register_frame takes a single argument: // a pointer to the start of the .eh_frame section. // How can it find the end? 
Because crtendS.o is linked @@ -143,9 +146,10 @@ Error registerEHFrameSection(const void *EHFrameSectionAddr, Error deregisterEHFrameSection(const void *EHFrameSectionAddr, size_t EHFrameSectionSize) { -#ifdef __APPLE__ - return walkAppleEHFrameSection(static_cast(EHFrameSectionAddr), - EHFrameSectionSize, deregisterFrameWrapper); +#ifdef HAVE_UNW_ADD_DYNAMIC_FDE + return walkLibunwindEHFrameSection( + static_cast(EHFrameSectionAddr), EHFrameSectionSize, + deregisterFrameWrapper); #else return deregisterFrameWrapper(EHFrameSectionAddr); #endif diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index b6ccd02405c10..252e20c3c38c9 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -67,7 +67,9 @@ static void __deregister_frame(void *p) { } #endif -#ifdef __APPLE__ +/* libgcc and libunwind __register_frame behave differently. We use the presence + * of __unw_add_dynamic_fde to detect libunwind. */ +#ifdef HAVE_UNW_ADD_DYNAMIC_FDE static const char *processFDE(const char *Entry, bool isDeregister) { const char *P = Entry; From 3c503ba06af40f0621b722fc6027fbd8c693c254 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sun, 15 Aug 2021 15:46:25 +0200 Subject: [PATCH 063/700] [FunctionImport] Fix build with old mingw (NFC) std::errc::operation_not_supported is not universally supported. Make use of LLVM's errc interoperability header, which lists known-good errc values. 
--- llvm/lib/Transforms/IPO/FunctionImport.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 2f6cf0ca7087a..4535b75e2c482 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" @@ -496,7 +497,7 @@ static void computeImportForFunction( VI.name().str() + " due to " + getFailureName(Reason); auto Error = make_error( - Msg, std::make_error_code(std::errc::operation_not_supported)); + Msg, make_error_code(errc::not_supported)); logAllUnhandledErrors(std::move(Error), errs(), "Error importing module: "); break; From a240b29f21b0ce2f861e703c7f4c1fcfc40a9fc6 Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Sun, 15 Aug 2021 22:43:46 +0800 Subject: [PATCH 064/700] [NFC] Simply update a FIXME comment X86 overrided LowerOperationWrapper was moved to common implementation in a7eae62. --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index ebe3bfc4b75ac..c29822120921b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -538,8 +538,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { return RecursivelyLegalizeResults(Op, ResultVals); } -// FIXME: This is very similar to the X86 override of -// TargetLowering::LowerOperationWrapper. Can we merge them somehow? +// FIXME: This is very similar to TargetLowering::LowerOperationWrapper. Can we +// merge them somehow? 
bool VectorLegalizer::LowerOperationWrapper(SDNode *Node, SmallVectorImpl &Results) { SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); From 944dfa4975e8d55ca9d97f6eb7222ff1d0f7291a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sun, 15 Aug 2021 16:47:27 +0200 Subject: [PATCH 065/700] [IndVars] Don't check for pointer exit count (NFC) After recent changes, exit counts and BE taken counts are always integers, so convert these to assertions. While here, also convert the loop invariance checks to asserts. Exit counts are always loop invariant. --- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 29 +++++++------------ 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 9ee2a2d0bf080..9b665bba4c97f 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1507,12 +1507,9 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { continue; } - // If we end up with a pointer exit count, bail. Note that we can end up - // with a pointer exit count for one exiting block, and not for another in - // the same loop. - if (!ExitCount->getType()->isIntegerTy() || - !MaxExitCount->getType()->isIntegerTy()) - continue; + assert(ExitCount->getType()->isIntegerTy() && + MaxExitCount->getType()->isIntegerTy() && + "Exit counts must be integers"); Type *WiderType = SE->getWiderType(MaxExitCount->getType(), ExitCount->getType()); @@ -1569,14 +1566,11 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // through *explicit* control flow. We have to eliminate the possibility of // implicit exits (see below) before we know it's truly exact. 
const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); - if (isa(ExactBTC) || - !SE->isLoopInvariant(ExactBTC, L) || - !isSafeToExpand(ExactBTC, *SE)) + if (isa(ExactBTC) || !isSafeToExpand(ExactBTC, *SE)) return false; - // If we end up with a pointer exit count, bail. It may be unsized. - if (!ExactBTC->getType()->isIntegerTy()) - return false; + assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant"); + assert(ExactBTC->getType()->isIntegerTy() && "BTC must be integer"); auto BadExit = [&](BasicBlock *ExitingBB) { // If our exiting block exits multiple loops, we can only rewrite the @@ -1603,15 +1597,12 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { return true; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); - if (isa(ExitCount) || - !SE->isLoopInvariant(ExitCount, L) || - !isSafeToExpand(ExitCount, *SE)) - return true; - - // If we end up with a pointer exit count, bail. It may be unsized. - if (!ExitCount->getType()->isIntegerTy()) + if (isa(ExitCount) || !isSafeToExpand(ExitCount, *SE)) return true; + assert(SE->isLoopInvariant(ExitCount, L) && + "Exit count must be loop invariant"); + assert(ExitCount->getType()->isIntegerTy() && "Exit count must be integer"); return false; }; From 35a8bdc775817ce13a6c9b5cf81502052634aa1f Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 15 Aug 2021 18:59:32 +0300 Subject: [PATCH 066/700] [NFCI][IndVars] rewriteLoopExitValues(): nowadays SCEV should not change `GEP` base pointer Currently/previously, while SCEV guaranteed that it produces the same value, the way it was produced may be illegal IR, so we have an ugly check that the replacement is valid. But now that the SCEV strictness wrt the pointer/integer types has been improved, i believe this invariant is already upheld by the SCEV itself, natively. I think we should add an assertion, wait for a week, and then, if all is good, rip out all this checking. Or we could just do the latter directly i guess. 
This reverts commit rL127839. Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D108043 --- llvm/lib/Transforms/Utils/LoopUtils.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index e4d78f9ada083..51498548856d6 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1351,6 +1351,9 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, // FIXME: isValidRewrite() is a hack. it should be an assert, eventually. Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion); + assert(Phi.ValidRewrite && + "Now that the SCEV is strict wrt pointer/integer types, this " + "invariant is expected to be uphold by SCEV itself."); if (!Phi.ValidRewrite) { DeadInsts.push_back(Phi.Expansion); continue; From 77a06a9c33a6731fbe800ffaf8ca398bbe00dcba Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 15 Aug 2021 19:02:32 +0300 Subject: [PATCH 067/700] [NFC][SimplifyCFG] Autogenerate check lines in a test to declutter further update --- .../SimplifyCFG/fold-branch-to-common-dest.ll | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll index 0dae0e8fc107b..2ff0418260771 100644 --- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll +++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll @@ -134,6 +134,7 @@ define void @one_pred_with_spec_call(i8 %v0, i8 %v1, i32* %p) { ; CHECK: final_right: ; CHECK-NEXT: call void @sideeffect0() ; CHECK-NEXT: br label [[COMMON_RET]] +; pred: %c0 = icmp ne i32* %p, null br i1 %c0, label %dispatch, label %final_right @@ -153,10 +154,19 @@ final_right: ; Drop dereferenceable on the parameter define void @one_pred_with_spec_call_deref(i8 %v0, i8 %v1, i32* %p) { -; CHECK-LABEL: 
one_pred_with_spec_call_deref -; CHECK-LABEL: pred: -; CHECK: %c0 = icmp ne i32* %p, null -; CHECK: %x = call i32 @speculate_call(i32* %p) +; CHECK-LABEL: @one_pred_with_spec_call_deref( +; CHECK-NEXT: pred: +; CHECK-NEXT: [[C0:%.*]] = icmp ne i32* [[P:%.*]], null +; CHECK-NEXT: [[X:%.*]] = call i32 @speculate_call(i32* [[P]]) +; CHECK-NEXT: [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[FINAL_RIGHT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: final_right: +; CHECK-NEXT: call void @sideeffect0() +; CHECK-NEXT: br label [[COMMON_RET]] +; pred: %c0 = icmp ne i32* %p, null br i1 %c0, label %dispatch, label %final_right From 78af5cb213b2f9fe3f47bf23947f14ac07024155 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 15 Aug 2021 19:01:44 +0300 Subject: [PATCH 068/700] [SimplifyCFG] performBranchToCommonDestFolding(): form block-closed SSA form before cloning instructions (PR51125) LLVM IR SSA form is "implicit" in `@pr51125`. While is a valid LLVM IR, and does not require any PHI nodes, that completely breaks the further logic in `CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses()` that updates the live-out uses of the bonus instructions. What i believe we need to do, is to first make the SSA form explicit, by inserting tautological PHI nodes, and rewriting the offending uses. 
``` $ /builddirs/llvm-project/build-Clang12/bin/opt -load /repositories/alive2/build-Clang-release/tv/tv.so -load-pass-plugin /repositories/alive2/build-Clang-release/tv/tv.so -tv -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=10 -tv -o /dev/null /tmp/test.ll ---------------------------------------- @global_pr51125 = global 4 bytes, align 4 define i32 @pr51125() { %entry: br label %L %L: %ld = load i32, * @global_pr51125, align 4 %iszero = icmp eq i32 %ld, 0 br i1 %iszero, label %exit, label %L2 %L2: store i32 4294967295, * @global_pr51125, align 4 %cmp = icmp eq i32 %ld, 4294967295 br i1 %cmp, label %L, label %exit %exit: %r = phi i32 [ %ld, %L2 ], [ %ld, %L ] ret i32 %r } => @global_pr51125 = global 4 bytes, align 4 define i32 @pr51125() { %entry: %ld.old = load i32, * @global_pr51125, align 4 %iszero.old = icmp eq i32 %ld.old, 0 br i1 %iszero.old, label %exit, label %L2 %L2: %ld2 = phi i32 [ %ld.old, %entry ], [ %ld, %L2 ] store i32 4294967295, * @global_pr51125, align 4 %cmp = icmp ne i32 %ld2, 4294967295 %ld = load i32, * @global_pr51125, align 4 %iszero = icmp eq i32 %ld, 0 %or.cond = select i1 %cmp, i1 1, i1 %iszero br i1 %or.cond, label %exit, label %L2 %exit: %ld1 = phi i32 [ poison, %L2 ], [ %ld.old, %entry ] %r = phi i32 [ %ld2, %L2 ], [ %ld.old, %entry ] ret i32 %r } Transformation seems to be correct! 
``` Fixes https://bugs.llvm.org/show_bug.cgi?id=51125 Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D106317 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 75 ++++++++++++++++++++--- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 847fdd760d2fe..68a0388398fc3 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1095,17 +1095,24 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( // Update (liveout) uses of bonus instructions, // now that the bonus instruction has been cloned into predecessor. - SSAUpdater SSAUpdate; - SSAUpdate.Initialize(BonusInst.getType(), - (NewBonusInst->getName() + ".merge").str()); - SSAUpdate.AddAvailableValue(BB, &BonusInst); - SSAUpdate.AddAvailableValue(PredBlock, NewBonusInst); + // Note that we expect to be in a block-closed SSA form for this to work! for (Use &U : make_early_inc_range(BonusInst.uses())) { auto *UI = cast(U.getUser()); - if (UI->getParent() != PredBlock) - SSAUpdate.RewriteUseAfterInsertions(U); - else // Use is in the same block as, and comes before, NewBonusInst. - SSAUpdate.RewriteUse(U); + auto *PN = dyn_cast(UI); + if (!PN) { + assert(UI->getParent() == BB && BonusInst.comesBefore(UI) && + "If the user is not a PHI node, then it should be in the same " + "block as, and come after, the original bonus instruction."); + continue; // Keep using the original bonus instruction. + } + // Is this the block-closed SSA form PHI node? + if (PN->getIncomingBlock(U) == BB) + continue; // Great, keep using the original bonus instruction. + // The only other alternative is an "use" when coming from + // the predecessor block - here we should refer to the cloned bonus instr. 
+ assert(PN->getIncomingBlock(U) == PredBlock && + "Not in block-closed SSA form?"); + U.set(NewBonusInst); } } } @@ -3032,6 +3039,56 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); + // We want to duplicate all the bonus instructions in this block, + // and rewrite their uses, but in some cases with self-loops, + // the naive use rewrite approach won't work (will result in miscompilations). + // To avoid this problem, let's form block-closed SSA form. + for (Instruction &BonusInst : + reverse(iterator_range(*BB))) { + auto IsBCSSAUse = [BB, &BonusInst](Use &U) { + auto *UI = cast(U.getUser()); + if (auto *PN = dyn_cast(UI)) + return PN->getIncomingBlock(U) == BB; + return UI->getParent() == BB && BonusInst.comesBefore(UI); + }; + + // Does this instruction require rewriting of uses? + if (all_of(BonusInst.uses(), IsBCSSAUse)) + continue; + + SSAUpdater SSAUpdate; + Type *Ty = BonusInst.getType(); + SmallVector BCSSAPHIs; + SSAUpdate.Initialize(Ty, BonusInst.getName()); + + // Into each successor block of BB, insert a PHI node, that receives + // the BonusInst when coming from it's basic block, or poison otherwise. + for (BasicBlock *Succ : successors(BB)) { + // The block may have the same successor multiple times. Do it only once. + if (SSAUpdate.HasValueForBlock(Succ)) + continue; + BCSSAPHIs.emplace_back(PHINode::Create( + Ty, 0, BonusInst.getName() + ".bcssa", &Succ->front())); + PHINode *PN = BCSSAPHIs.back(); + for (BasicBlock *PredOfSucc : predecessors(Succ)) + PN->addIncoming(PredOfSucc == BB ? (Value *)&BonusInst + : PoisonValue::get(Ty), + PredOfSucc); + SSAUpdate.AddAvailableValue(Succ, PN); + } + + // And rewrite all uses that break block-closed SSA form. 
+ for (Use &U : make_early_inc_range(BonusInst.uses())) + if (!IsBCSSAUse(U)) + SSAUpdate.RewriteUseAfterInsertions(U); + + // We might not have ended up needing PHI's in all of the succ blocks, + // drop the ones that are certainly unused, but don't bother otherwise. + for (PHINode *PN : BCSSAPHIs) + if (PN->use_empty()) + PN->eraseFromParent(); + } + IRBuilder<> Builder(PBI); // The builder is used to create instructions to eliminate the branch in BB. // If BB's terminator has !annotation metadata, add it to the new From 60dd0121c92e93a33cf39b5f7006923ac9e4f127 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 15 Aug 2021 19:15:09 +0300 Subject: [PATCH 069/700] Revert "[SimplifyCFG] performBranchToCommonDestFolding(): form block-closed SSA form before cloning instructions (PR51125)" Forgot to stage the test change. This reverts commit 78af5cb213b2f9fe3f47bf23947f14ac07024155. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 75 +++-------------------- 1 file changed, 9 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 68a0388398fc3..847fdd760d2fe 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1095,24 +1095,17 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( // Update (liveout) uses of bonus instructions, // now that the bonus instruction has been cloned into predecessor. - // Note that we expect to be in a block-closed SSA form for this to work! 
+ SSAUpdater SSAUpdate; + SSAUpdate.Initialize(BonusInst.getType(), + (NewBonusInst->getName() + ".merge").str()); + SSAUpdate.AddAvailableValue(BB, &BonusInst); + SSAUpdate.AddAvailableValue(PredBlock, NewBonusInst); for (Use &U : make_early_inc_range(BonusInst.uses())) { auto *UI = cast(U.getUser()); - auto *PN = dyn_cast(UI); - if (!PN) { - assert(UI->getParent() == BB && BonusInst.comesBefore(UI) && - "If the user is not a PHI node, then it should be in the same " - "block as, and come after, the original bonus instruction."); - continue; // Keep using the original bonus instruction. - } - // Is this the block-closed SSA form PHI node? - if (PN->getIncomingBlock(U) == BB) - continue; // Great, keep using the original bonus instruction. - // The only other alternative is an "use" when coming from - // the predecessor block - here we should refer to the cloned bonus instr. - assert(PN->getIncomingBlock(U) == PredBlock && - "Not in block-closed SSA form?"); - U.set(NewBonusInst); + if (UI->getParent() != PredBlock) + SSAUpdate.RewriteUseAfterInsertions(U); + else // Use is in the same block as, and comes before, NewBonusInst. + SSAUpdate.RewriteUse(U); } } } @@ -3039,56 +3032,6 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); - // We want to duplicate all the bonus instructions in this block, - // and rewrite their uses, but in some cases with self-loops, - // the naive use rewrite approach won't work (will result in miscompilations). - // To avoid this problem, let's form block-closed SSA form. - for (Instruction &BonusInst : - reverse(iterator_range(*BB))) { - auto IsBCSSAUse = [BB, &BonusInst](Use &U) { - auto *UI = cast(U.getUser()); - if (auto *PN = dyn_cast(UI)) - return PN->getIncomingBlock(U) == BB; - return UI->getParent() == BB && BonusInst.comesBefore(UI); - }; - - // Does this instruction require rewriting of uses? 
- if (all_of(BonusInst.uses(), IsBCSSAUse)) - continue; - - SSAUpdater SSAUpdate; - Type *Ty = BonusInst.getType(); - SmallVector BCSSAPHIs; - SSAUpdate.Initialize(Ty, BonusInst.getName()); - - // Into each successor block of BB, insert a PHI node, that receives - // the BonusInst when coming from it's basic block, or poison otherwise. - for (BasicBlock *Succ : successors(BB)) { - // The block may have the same successor multiple times. Do it only once. - if (SSAUpdate.HasValueForBlock(Succ)) - continue; - BCSSAPHIs.emplace_back(PHINode::Create( - Ty, 0, BonusInst.getName() + ".bcssa", &Succ->front())); - PHINode *PN = BCSSAPHIs.back(); - for (BasicBlock *PredOfSucc : predecessors(Succ)) - PN->addIncoming(PredOfSucc == BB ? (Value *)&BonusInst - : PoisonValue::get(Ty), - PredOfSucc); - SSAUpdate.AddAvailableValue(Succ, PN); - } - - // And rewrite all uses that break block-closed SSA form. - for (Use &U : make_early_inc_range(BonusInst.uses())) - if (!IsBCSSAUse(U)) - SSAUpdate.RewriteUseAfterInsertions(U); - - // We might not have ended up needing PHI's in all of the succ blocks, - // drop the ones that are certainly unused, but don't bother otherwise. - for (PHINode *PN : BCSSAPHIs) - if (PN->use_empty()) - PN->eraseFromParent(); - } - IRBuilder<> Builder(PBI); // The builder is used to create instructions to eliminate the branch in BB. // If BB's terminator has !annotation metadata, add it to the new From 3d9beefc7d713ad8462d92427ccd17b9532ce904 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Sun, 15 Aug 2021 19:01:44 +0300 Subject: [PATCH 070/700] Reland [SimplifyCFG] performBranchToCommonDestFolding(): form block-closed SSA form before cloning instructions (PR51125) ... with test change this time. LLVM IR SSA form is "implicit" in `@pr51125`. 
While is a valid LLVM IR, and does not require any PHI nodes, that completely breaks the further logic in `CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses()` that updates the live-out uses of the bonus instructions. What i believe we need to do, is to first make the SSA form explicit, by inserting tautological PHI nodes, and rewriting the offending uses. ``` $ /builddirs/llvm-project/build-Clang12/bin/opt -load /repositories/alive2/build-Clang-release/tv/tv.so -load-pass-plugin /repositories/alive2/build-Clang-release/tv/tv.so -tv -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -bonus-inst-threshold=10 -tv -o /dev/null /tmp/test.ll ---------------------------------------- @global_pr51125 = global 4 bytes, align 4 define i32 @pr51125() { %entry: br label %L %L: %ld = load i32, * @global_pr51125, align 4 %iszero = icmp eq i32 %ld, 0 br i1 %iszero, label %exit, label %L2 %L2: store i32 4294967295, * @global_pr51125, align 4 %cmp = icmp eq i32 %ld, 4294967295 br i1 %cmp, label %L, label %exit %exit: %r = phi i32 [ %ld, %L2 ], [ %ld, %L ] ret i32 %r } => @global_pr51125 = global 4 bytes, align 4 define i32 @pr51125() { %entry: %ld.old = load i32, * @global_pr51125, align 4 %iszero.old = icmp eq i32 %ld.old, 0 br i1 %iszero.old, label %exit, label %L2 %L2: %ld2 = phi i32 [ %ld.old, %entry ], [ %ld, %L2 ] store i32 4294967295, * @global_pr51125, align 4 %cmp = icmp ne i32 %ld2, 4294967295 %ld = load i32, * @global_pr51125, align 4 %iszero = icmp eq i32 %ld, 0 %or.cond = select i1 %cmp, i1 1, i1 %iszero br i1 %or.cond, label %exit, label %L2 %exit: %ld1 = phi i32 [ poison, %L2 ], [ %ld.old, %entry ] %r = phi i32 [ %ld2, %L2 ], [ %ld.old, %entry ] ret i32 %r } Transformation seems to be correct! 
``` Fixes https://bugs.llvm.org/show_bug.cgi?id=51125 Reviewed By: nikic Differential Revision: https://reviews.llvm.org/D106317 --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 75 ++++++++++++++++--- .../SimplifyCFG/fold-branch-to-common-dest.ll | 18 ++--- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 847fdd760d2fe..68a0388398fc3 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1095,17 +1095,24 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( // Update (liveout) uses of bonus instructions, // now that the bonus instruction has been cloned into predecessor. - SSAUpdater SSAUpdate; - SSAUpdate.Initialize(BonusInst.getType(), - (NewBonusInst->getName() + ".merge").str()); - SSAUpdate.AddAvailableValue(BB, &BonusInst); - SSAUpdate.AddAvailableValue(PredBlock, NewBonusInst); + // Note that we expect to be in a block-closed SSA form for this to work! for (Use &U : make_early_inc_range(BonusInst.uses())) { auto *UI = cast(U.getUser()); - if (UI->getParent() != PredBlock) - SSAUpdate.RewriteUseAfterInsertions(U); - else // Use is in the same block as, and comes before, NewBonusInst. - SSAUpdate.RewriteUse(U); + auto *PN = dyn_cast(UI); + if (!PN) { + assert(UI->getParent() == BB && BonusInst.comesBefore(UI) && + "If the user is not a PHI node, then it should be in the same " + "block as, and come after, the original bonus instruction."); + continue; // Keep using the original bonus instruction. + } + // Is this the block-closed SSA form PHI node? + if (PN->getIncomingBlock(U) == BB) + continue; // Great, keep using the original bonus instruction. + // The only other alternative is an "use" when coming from + // the predecessor block - here we should refer to the cloned bonus instr. 
+ assert(PN->getIncomingBlock(U) == PredBlock && + "Not in block-closed SSA form?"); + U.set(NewBonusInst); } } } @@ -3032,6 +3039,56 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); + // We want to duplicate all the bonus instructions in this block, + // and rewrite their uses, but in some cases with self-loops, + // the naive use rewrite approach won't work (will result in miscompilations). + // To avoid this problem, let's form block-closed SSA form. + for (Instruction &BonusInst : + reverse(iterator_range(*BB))) { + auto IsBCSSAUse = [BB, &BonusInst](Use &U) { + auto *UI = cast(U.getUser()); + if (auto *PN = dyn_cast(UI)) + return PN->getIncomingBlock(U) == BB; + return UI->getParent() == BB && BonusInst.comesBefore(UI); + }; + + // Does this instruction require rewriting of uses? + if (all_of(BonusInst.uses(), IsBCSSAUse)) + continue; + + SSAUpdater SSAUpdate; + Type *Ty = BonusInst.getType(); + SmallVector BCSSAPHIs; + SSAUpdate.Initialize(Ty, BonusInst.getName()); + + // Into each successor block of BB, insert a PHI node, that receives + // the BonusInst when coming from it's basic block, or poison otherwise. + for (BasicBlock *Succ : successors(BB)) { + // The block may have the same successor multiple times. Do it only once. + if (SSAUpdate.HasValueForBlock(Succ)) + continue; + BCSSAPHIs.emplace_back(PHINode::Create( + Ty, 0, BonusInst.getName() + ".bcssa", &Succ->front())); + PHINode *PN = BCSSAPHIs.back(); + for (BasicBlock *PredOfSucc : predecessors(Succ)) + PN->addIncoming(PredOfSucc == BB ? (Value *)&BonusInst + : PoisonValue::get(Ty), + PredOfSucc); + SSAUpdate.AddAvailableValue(Succ, PN); + } + + // And rewrite all uses that break block-closed SSA form. 
+ for (Use &U : make_early_inc_range(BonusInst.uses())) + if (!IsBCSSAUse(U)) + SSAUpdate.RewriteUseAfterInsertions(U); + + // We might not have ended up needing PHI's in all of the succ blocks, + // drop the ones that are certainly unused, but don't bother otherwise. + for (PHINode *PN : BCSSAPHIs) + if (PN->use_empty()) + PN->eraseFromParent(); + } + IRBuilder<> Builder(PBI); // The builder is used to create instructions to eliminate the branch in BB. // If BB's terminator has !annotation metadata, add it to the new diff --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll index 2ff0418260771..d948b61d65a03 100644 --- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll +++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll @@ -834,7 +834,7 @@ define void @pr48450() { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_MERGE:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] +; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_BCSSA1:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] ; CHECK-NEXT: [[C:%.*]] = call i1 @gen1() ; CHECK-NEXT: br i1 [[C]], label [[FOR_INC:%.*]], label [[IF_THEN:%.*]] ; CHECK: for.inc: @@ -849,7 +849,7 @@ define void @pr48450() { ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C2_NOT]], i1 true, i1 [[CMP_NOT]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END_LOOPEXIT]], label [[FOR_BODYTHREAD_PRE_SPLIT]] ; CHECK: for.bodythread-pre-split: -; CHECK-NEXT: [[DEC_MERGE]] = phi i8 [ [[DEC]], [[IF_THEN]] ], [ [[DEC_OLD]], [[FOR_INC]] ] +; CHECK-NEXT: [[DEC_BCSSA1]] = phi i8 [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC]], [[IF_THEN]] ] ; CHECK-NEXT: call void @sideeffect0() ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: if.end.loopexit: @@ -885,7 +885,7 @@ define void @pr48450_2(i1 %enable_loopback) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_MERGE:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] +; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_BCSSA1:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] ; CHECK-NEXT: [[C:%.*]] = call i1 @gen1() ; CHECK-NEXT: br i1 [[C]], label [[FOR_INC:%.*]], label [[IF_THEN:%.*]] ; CHECK: for.inc: @@ -900,7 +900,7 @@ define void @pr48450_2(i1 %enable_loopback) { ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C2_NOT]], i1 true, i1 [[CMP_NOT]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END_LOOPEXIT]], label [[FOR_BODYTHREAD_PRE_SPLIT]] ; CHECK: for.bodythread-pre-split: -; CHECK-NEXT: [[DEC_MERGE]] = phi i8 [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC_MERGE]], [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK:%.*]] ], [ [[DEC]], [[IF_THEN]] ] +; CHECK-NEXT: [[DEC_BCSSA1]] = phi i8 [ poison, [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK:%.*]] ], [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC]], [[IF_THEN]] ] ; CHECK-NEXT: [[SHOULD_LOOPBACK:%.*]] = phi i1 [ true, [[FOR_INC]] ], [ false, [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK]] ], [ true, [[IF_THEN]] ] ; CHECK-NEXT: [[DO_LOOPBACK:%.*]] = and i1 [[SHOULD_LOOPBACK]], [[ENABLE_LOOPBACK:%.*]] ; CHECK-NEXT: call void @sideeffect0() @@ -1005,8 +1005,8 @@ define void @pr49510() { ; CHECK-NEXT: [[TOBOOL_OLD:%.*]] = icmp ne i16 [[DOTOLD]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_OLD]], label [[LAND_RHS:%.*]], label [[FOR_END:%.*]] ; CHECK: land.rhs: -; CHECK-NEXT: [[DOTMERGE:%.*]] = phi i16 [ [[TMP0:%.*]], [[LAND_RHS]] ], [ [[DOTOLD]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[DOTMERGE]], 0 +; CHECK-NEXT: [[DOTBCSSA:%.*]] = phi i16 [ [[DOTOLD]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LAND_RHS]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[DOTBCSSA]], 0 ; CHECK-NEXT: [[TMP0]] = load i16, i16* @global_pr49510, align 1 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i16 [[TMP0]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[TOBOOL]], i1 false @@ -1043,15 
+1043,15 @@ define i32 @pr51125() { ; CHECK-NEXT: [[ISZERO_OLD:%.*]] = icmp eq i32 [[LD_OLD]], 0 ; CHECK-NEXT: br i1 [[ISZERO_OLD]], label [[EXIT:%.*]], label [[L2:%.*]] ; CHECK: L2: -; CHECK-NEXT: [[LD_MERGE:%.*]] = phi i32 [ [[LD:%.*]], [[L2]] ], [ [[LD_OLD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[LD_BCSSA1:%.*]] = phi i32 [ [[LD_OLD]], [[ENTRY:%.*]] ], [ [[LD:%.*]], [[L2]] ] ; CHECK-NEXT: store i32 -1, i32* @global_pr51125, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[LD_MERGE]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[LD_BCSSA1]], -1 ; CHECK-NEXT: [[LD]] = load i32, i32* @global_pr51125, align 4 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[LD]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 true, i1 [[ISZERO]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT]], label [[L2]] ; CHECK: exit: -; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[LD]], [[L2]] ], [ [[LD_OLD]], [[ENTRY]] ] +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[LD_BCSSA1]], [[L2]] ], [ [[LD_OLD]], [[ENTRY]] ] ; CHECK-NEXT: ret i32 [[R]] ; entry: From c6b7db015f6e203b3f1b1bb9f1468196a9fe7328 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 15 Aug 2021 17:25:16 +0100 Subject: [PATCH 071/700] [InstCombine] Add call to matchSAddSubSat from min/max This adds a call to matchSAddSubSat from smin/smax instrinsics, allowing the same patterns to match if the canonical form of a min/max is an intrinsics, not a icmp/select. 
Differential Revision: https://reviews.llvm.org/D108077 --- .../InstCombine/InstCombineCalls.cpp | 3 + .../InstCombine/InstCombineInternal.h | 2 +- .../InstCombine/InstCombineSelect.cpp | 9 +- llvm/test/Transforms/InstCombine/sadd_sat.ll | 118 +++++++++--------- 4 files changed, 71 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a532c8e468683..25597840cad38 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1109,6 +1109,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *Sel = foldClampRangeOfTwo(II, Builder)) return Sel; + if (Instruction *SAdd = matchSAddSubSat(*II)) + return SAdd; + if (match(I1, m_ImmConstant())) if (auto *Sel = dyn_cast(I0)) if (Instruction *R = FoldOpIntoSelect(*II, Sel)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index cefb947567e98..aaa3f7297ae61 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -326,7 +326,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Instruction *narrowMathIfNoOverflow(BinaryOperator &I); Instruction *narrowFunnelShift(TruncInst &Trunc); Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN); - Instruction *matchSAddSubSat(SelectInst &MinMax1); + Instruction *matchSAddSubSat(Instruction &MinMax1); void freelyInvertAllUsersOf(Value *V); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 14bf26d74fad0..946f70da083ac 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2182,7 +2182,7 @@ static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X, } /// Match a 
sadd_sat or ssub_sat which is using min/max to clamp the value. -Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) { +Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) { Type *Ty = MinMax1.getType(); // We are looking for a tree of: @@ -2212,9 +2212,10 @@ Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) { if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth)) return nullptr; - // Also make sure that the number of uses is as expected. The "3"s are for the - // the two items of min/max (the compare and the select). - if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3)) + // Also make sure that the number of uses is as expected. The 3 is for the + // the two items of the compare and the select, or 2 from a min/max. + unsigned ExpUses = isa(MinMax1) ? 2 : 3; + if (MinMax2->hasNUsesOrMore(ExpUses) || AddSub->hasNUsesOrMore(ExpUses)) return nullptr; // Create the new type (which can be a vector type) diff --git a/llvm/test/Transforms/InstCombine/sadd_sat.ll b/llvm/test/Transforms/InstCombine/sadd_sat.ll index ff4a5e656fec8..b9a4771f6eeb4 100644 --- a/llvm/test/Transforms/InstCombine/sadd_sat.ll +++ b/llvm/test/Transforms/InstCombine/sadd_sat.ll @@ -24,13 +24,8 @@ entry: define i32 @sadd_sat32_mm(i32 %a, i32 %b) { ; CHECK-LABEL: @sadd_sat32_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 -; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) -; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] to i32 -; CHECK-NEXT: ret i32 [[CONV7]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[B:%.*]], i32 [[A:%.*]]) +; CHECK-NEXT: ret i32 [[TMP0]] ; entry: %conv = sext i32 
%a to i64 @@ -63,13 +58,8 @@ entry: define i32 @ssub_sat32_mm(i32 %a, i32 %b) { ; CHECK-LABEL: @ssub_sat32_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 -; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV1]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[SUB]], i64 2147483647) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) -; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] to i32 -; CHECK-NEXT: ret i32 [[CONV7]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[A:%.*]], i32 [[B:%.*]]) +; CHECK-NEXT: ret i32 [[TMP0]] ; entry: %conv = sext i32 %a to i64 @@ -148,13 +138,8 @@ entry: define signext i16 @sadd_sat16_mm(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: @sadd_sat16_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[A:%.*]] to i32 -; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[ADD]], i32 32767) -; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -32768) -; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i16 -; CHECK-NEXT: ret i16 [[CONV9]] +; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[B:%.*]], i16 [[A:%.*]]) +; CHECK-NEXT: ret i16 [[TMP0]] ; entry: %conv = sext i16 %a to i32 @@ -187,13 +172,8 @@ entry: define signext i16 @ssub_sat16_mm(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: @ssub_sat16_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[A:%.*]] to i32 -; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 32767) -; CHECK-NEXT: 
[[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -32768) -; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i16 -; CHECK-NEXT: ret i16 [[CONV9]] +; CHECK-NEXT: [[TMP0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[A:%.*]], i16 [[B:%.*]]) +; CHECK-NEXT: ret i16 [[TMP0]] ; entry: %conv = sext i16 %a to i32 @@ -226,13 +206,8 @@ entry: define signext i8 @sadd_sat8_mm(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: @sadd_sat8_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[A:%.*]] to i32 -; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[B:%.*]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[ADD]], i32 127) -; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -128) -; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i8 -; CHECK-NEXT: ret i8 [[CONV9]] +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.sadd.sat.i8(i8 [[B:%.*]], i8 [[A:%.*]]) +; CHECK-NEXT: ret i8 [[TMP0]] ; entry: %conv = sext i8 %a to i32 @@ -265,13 +240,8 @@ entry: define signext i8 @ssub_sat8_mm(i8 signext %a, i8 signext %b) { ; CHECK-LABEL: @ssub_sat8_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[A:%.*]] to i32 -; CHECK-NEXT: [[CONV1:%.*]] = sext i8 [[B:%.*]] to i32 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 127) -; CHECK-NEXT: [[SPEC_STORE_SELECT10:%.*]] = call i32 @llvm.smax.i32(i32 [[SPEC_STORE_SELECT]], i32 -128) -; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SPEC_STORE_SELECT10]] to i8 -; CHECK-NEXT: ret i8 [[CONV9]] +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.ssub.sat.i8(i8 [[A:%.*]], i8 [[B:%.*]]) +; CHECK-NEXT: ret i8 [[TMP0]] ; entry: %conv = sext i8 %a to i32 @@ -390,13 +360,8 @@ entry: define <4 x i32> @sadd_satv4i32_mm(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 
@sadd_satv4i32_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext <4 x i32> [[A:%.*]] to <4 x i64> -; CHECK-NEXT: [[CONV1:%.*]] = sext <4 x i32> [[B:%.*]] to <4 x i64> -; CHECK-NEXT: [[ADD:%.*]] = add nsw <4 x i64> [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[ADD]], <4 x i64> ) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[SPEC_STORE_SELECT]], <4 x i64> ) -; CHECK-NEXT: [[CONV7:%.*]] = trunc <4 x i64> [[SPEC_STORE_SELECT8]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[CONV7]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]]) +; CHECK-NEXT: ret <4 x i32> [[TMP0]] ; entry: %conv = sext <4 x i32> %a to <4 x i64> @@ -429,13 +394,8 @@ entry: define <4 x i32> @ssub_satv4i32_mm(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @ssub_satv4i32_mm( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CONV:%.*]] = sext <4 x i32> [[A:%.*]] to <4 x i64> -; CHECK-NEXT: [[CONV1:%.*]] = sext <4 x i32> [[B:%.*]] to <4 x i64> -; CHECK-NEXT: [[ADD:%.*]] = sub nsw <4 x i64> [[CONV1]], [[CONV]] -; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[ADD]], <4 x i64> ) -; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[SPEC_STORE_SELECT]], <4 x i64> ) -; CHECK-NEXT: [[CONV7:%.*]] = trunc <4 x i64> [[SPEC_STORE_SELECT8]] to <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[CONV7]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]]) +; CHECK-NEXT: ret <4 x i32> [[TMP0]] ; entry: %conv = sext <4 x i32> %a to <4 x i64> @@ -534,6 +494,29 @@ entry: ret i32 %conv7 } +define i32 @sadd_sat32_extrause_2_mm(i32 %a, i32 %b) { +; CHECK-LABEL: @sadd_sat32_extrause_2_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw 
i64 [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] to i32 +; CHECK-NEXT: call void @use64(i64 [[SPEC_STORE_SELECT]]) +; CHECK-NEXT: ret i32 [[CONV7]] +; +entry: + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %add = add i64 %conv1, %conv + %spec.store.select = call i64 @llvm.smin.i64(i64 %add, i64 2147483647) + %spec.store.select8 = call i64 @llvm.smax.i64(i64 %spec.store.select, i64 -2147483648) + %conv7 = trunc i64 %spec.store.select8 to i32 + call void @use64(i64 %spec.store.select) + ret i32 %conv7 +} + define i32 @sadd_sat32_extrause_3(i32 %a, i32 %b) { ; CHECK-LABEL: @sadd_sat32_extrause_3( ; CHECK-NEXT: entry: @@ -561,6 +544,29 @@ entry: ret i32 %conv7 } +define i32 @sadd_sat32_extrause_3_mm(i32 %a, i32 %b) { +; CHECK-LABEL: @sadd_sat32_extrause_3_mm( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[CONV1:%.*]] = sext i32 [[B:%.*]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[CONV1]], [[CONV]] +; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = call i64 @llvm.smin.i64(i64 [[ADD]], i64 2147483647) +; CHECK-NEXT: [[SPEC_STORE_SELECT8:%.*]] = call i64 @llvm.smax.i64(i64 [[SPEC_STORE_SELECT]], i64 -2147483648) +; CHECK-NEXT: [[CONV7:%.*]] = trunc i64 [[SPEC_STORE_SELECT8]] to i32 +; CHECK-NEXT: call void @use64(i64 [[ADD]]) +; CHECK-NEXT: ret i32 [[CONV7]] +; +entry: + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %add = add i64 %conv1, %conv + %spec.store.select = call i64 @llvm.smin.i64(i64 %add, i64 2147483647) + %spec.store.select8 = call i64 @llvm.smax.i64(i64 %spec.store.select, i64 -2147483648) + %conv7 = trunc i64 %spec.store.select8 to i32 + call void @use64(i64 %add) + ret i32 %conv7 +} + define i32 @sadd_sat32_trunc(i32 %a, i32 %b) { ; 
CHECK-LABEL: @sadd_sat32_trunc( ; CHECK-NEXT: entry: From e6e687f2d993317995c101f95b9a622a35975228 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 15 Aug 2021 09:31:39 -0700 Subject: [PATCH 072/700] [AsmParser] Remove MDSignedOrUnsignedField (NFC) The last use was removed on Apr 18, 2020 in commit aad3d578da0ddf6d0d3d95e5e09a32e47f6dfeb8. --- llvm/lib/AsmParser/LLParser.cpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 9ae10d76692a1..64af1dc3751e4 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3914,22 +3914,6 @@ struct MDSignedOrMDField : MDEitherFieldImpl { } }; -struct MDSignedOrUnsignedField - : MDEitherFieldImpl { - MDSignedOrUnsignedField() : ImplTy(MDSignedField(0), MDUnsignedField(0)) {} - - bool isMDSignedField() const { return WhatIs == IsTypeA; } - bool isMDUnsignedField() const { return WhatIs == IsTypeB; } - int64_t getMDSignedValue() const { - assert(isMDSignedField() && "Wrong field type"); - return A.Val; - } - uint64_t getMDUnsignedValue() const { - assert(isMDUnsignedField() && "Wrong field type"); - return B.Val; - } -}; - } // end anonymous namespace namespace llvm { From 20170447b06d70a90883809159848a44ac299584 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Sun, 15 Aug 2021 13:21:36 -0400 Subject: [PATCH 073/700] [gn build] (manually) port 957334382cd1 --- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 74828a6e9acf9..1e4fb7870cc2d 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -162,6 +162,7 @@ write_cmake_config("config") { "HAVE_MALLOC_ZONE_STATISTICS=1", "HAVE_PROC_PID_RUSAGE=1", "HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC=1", + 
"HAVE_UNW_ADD_DYNAMIC_FDE=1", ] } else { values += [ @@ -173,6 +174,7 @@ write_cmake_config("config") { "HAVE_MALLOC_ZONE_STATISTICS=", "HAVE_PROC_PID_RUSAGE=", "HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC=", + "HAVE_UNW_ADD_DYNAMIC_FDE=", ] } From cd0e1964137f1cd7b508809ec80c7d9dcb3f0458 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Sun, 15 Aug 2021 14:24:20 +0100 Subject: [PATCH 074/700] [DAGCombiner] Stop visitEXTRACT_SUBVECTOR creating illegal BITCASTs post legalisation. visitEXTRACT_SUBVECTOR can sometimes create illegal BITCASTs when removing "redundant" INSERT_SUBVECTOR operations. This patch adds an extra check to ensure such combines only occur after operation legalisation if any resulting BITBAST is itself legal. Differential Revision: https://reviews.llvm.org/D108086 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++- .../sve-fixed-length-masked-scatter.ll | 31 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 3e008ae76ca1a..bf83ccf535a9f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20583,8 +20583,12 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { // otherwise => (extract_subvec V1, ExtIdx) uint64_t InsIdx = V.getConstantOperandVal(2); if (InsIdx * SmallVT.getScalarSizeInBits() == - ExtIdx * NVT.getScalarSizeInBits()) + ExtIdx * NVT.getScalarSizeInBits()) { + if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT)) + return SDValue(); + return DAG.getBitcast(NVT, V.getOperand(1)); + } return DAG.getNode( ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ec0f4bdf025c6..2ce98f00687df 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -1057,6 +1057,37 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ret void } +; extract_subvec(...(insert_subvec(a,b,c))) -> extract_subvec(bitcast(b),d) like +; combines can effectively unlegalise bitcast operations. This test ensures such +; combines do not happen after operation legalisation. When not prevented the +; test triggers infinite combine->legalise->combine->... +; +; NOTE: For this test to function correctly it's critical for %vals to be in a +; different block to the scatter store. If not, the problematic bitcast will be +; removed before operation legalisation and thus not exercise the combine. +define void @masked_scatter_bitcast_infinite_loop(<8 x double>* %a, <8 x double*>* %b, i1 %cond) #0 { +; CHECK-LABEL: masked_scatter_bitcast_infinite_loop +; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] +; VBITS_GE_512-NEXT: tbz w2, #0, [[LABEL:.*]] +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] +; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 +; VBITS_GE_512-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_512-NEXT: [[LABEL]]: +; VBITS_GE_512-NEXT: ret + %vals = load volatile <8 x double>, <8 x double>* %a + br i1 %cond, label %bb.1, label %bb.2 + +bb.1: + %ptrs = load <8 x double*>, <8 x double*>* %b + %mask = fcmp oeq <8 x double> %vals, zeroinitializer + call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x double*> %ptrs, i32 8, <8 x i1> %mask) + br label %bb.2 + +bb.2: + ret void +} + declare void @llvm.masked.scatter.v2i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>) declare void @llvm.masked.scatter.v4i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) From 8721490d3859b4d7fff633bdf3c71d888e75727c 
Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 15 Aug 2021 13:39:28 -0400 Subject: [PATCH 075/700] [x86] split memcmp tests for 32/64-bit targets; NFC memcmp is defined as taking a size_t length arg, so that differs depending on pointer size of the target. We casually matched non-compliant function signatures as memcmp, but that can cause crashing as seen with PR50850. If we fix that bug, these tests would no longer be testing the expected behavior for a 32-bit target, so I have duplicated all tests and adjusted them to match the stricter definition of memcmp/bcmp by changing the length arg to i32 on a 32-bit target. --- llvm/test/CodeGen/X86/memcmp-minsize-x32.ll | 445 +++ llvm/test/CodeGen/X86/memcmp-minsize.ll | 323 -- .../CodeGen/X86/memcmp-more-load-pairs-x32.ll | 2916 +++++++++++++++++ .../CodeGen/X86/memcmp-more-load-pairs.ll | 2422 -------------- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 584 ++++ llvm/test/CodeGen/X86/memcmp-optsize.ll | 451 --- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 601 ++++ llvm/test/CodeGen/X86/memcmp-pgso.ll | 450 --- llvm/test/CodeGen/X86/memcmp-x32.ll | 2434 ++++++++++++++ llvm/test/CodeGen/X86/memcmp.ll | 1926 ----------- .../Transforms/ExpandMemCmp/X86/memcmp-x32.ll | 615 ++++ .../Transforms/ExpandMemCmp/X86/memcmp.ll | 619 ++-- 12 files changed, 7786 insertions(+), 6000 deletions(-) create mode 100644 llvm/test/CodeGen/X86/memcmp-minsize-x32.ll create mode 100644 llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll create mode 100644 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll create mode 100644 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll create mode 100644 llvm/test/CodeGen/X86/memcmp-x32.ll create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll new file mode 100644 index 0000000000000..9f56c38062f68 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll @@ -0,0 +1,445 @@ +; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare dso_local i32 @memcmp(i8*, i8*, i32) + +define i32 @length2(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: 
sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3: +; X86: # %bb.0: +; X86-NEXT: pushl $3 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $3 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: pushl $4 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 4) 
nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5: +; X86: # %bb.0: +; X86-NEXT: pushl $5 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $5 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl $8 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $8 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $8 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 
0), i32 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: 
setne %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind minsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length24_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length24_eq_const: +; X86: # %bb.0: +; X86-NEXT: 
pushl $24 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length32_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length32_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind minsize { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* 
%Y, i32 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind minsize { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll index fec09f6b8d364..3bcd79c4c0cd4 100644 --- a/llvm/test/CodeGen/X86/memcmp-minsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-minsize.ll @@ -1,6 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 @@ -13,16 +11,6 @@ declare dso_local i32 @memcmp(i8*, i8*, i64) define i32 @length2(i8* %X, 
i8* %Y) nounwind minsize { -; X86-LABEL: length2: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length2: ; X64: # %bb.0: ; X64-NEXT: pushq $2 @@ -33,15 +21,6 @@ define i32 @length2(i8* %X, i8* %Y) nounwind minsize { } define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length2_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -54,13 +33,6 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize { } define i1 @length2_eq_const(i8* %X) nounwind minsize { -; X86-LABEL: length2_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: ; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 @@ -72,18 +44,6 @@ define i1 @length2_eq_const(i8* %X) nounwind minsize { } define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length2_eq_nobuiltin_attr: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_nobuiltin_attr: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -100,16 +60,6 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize { } define i32 @length3(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length3: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: pushq $3 @@ -120,18 +70,6 @@ define i32 @length3(i8* %X, i8* %Y) nounwind minsize { } define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length3_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -148,16 +86,6 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize { } define i32 @length4(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length4: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $4 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length4: ; X64: # %bb.0: ; X64-NEXT: pushq $4 @@ -168,15 +96,6 @@ define i32 @length4(i8* %X, i8* %Y) nounwind minsize { } define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length4_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -189,13 +108,6 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize { } define i1 @length4_eq_const(i8* %X) nounwind minsize { -; X86-LABEL: length4_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 @@ -207,16 +119,6 @@ define i1 @length4_eq_const(i8* %X) nounwind minsize { } define i32 
@length5(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length5: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length5: ; X64: # %bb.0: ; X64-NEXT: pushq $5 @@ -227,18 +129,6 @@ define i32 @length5(i8* %X, i8* %Y) nounwind minsize { } define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length5_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $5 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -255,16 +145,6 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize { } define i32 @length8(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length8: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length8: ; X64: # %bb.0: ; X64-NEXT: pushq $8 @@ -275,18 +155,6 @@ define i32 @length8(i8* %X, i8* %Y) nounwind minsize { } define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length8_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -299,18 +167,6 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize { } define i1 @length8_eq_const(i8* %X) nounwind minsize { -; X86-LABEL: length8_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $8 -; X86-NEXT: pushl $.L.str -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 @@ -323,18 +179,6 @@ define i1 @length8_eq_const(i8* %X) nounwind minsize { } define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length12_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length12_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -351,16 +195,6 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize { } define i32 @length12(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length12: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: pushq $12 @@ -373,15 +207,6 @@ define i32 @length12(i8* %X, i8* %Y) nounwind minsize { ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 define i32 @length16(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length16: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl ; ; X64-LABEL: length16: ; X64: # %bb.0: @@ -393,30 +218,6 @@ define i32 @length16(i8* %X, i8* %Y) nounwind minsize { } define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rsi), %xmm0 @@ -440,28 +241,6 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize { } define i1 @length16_eq_const(i8* %X) nounwind minsize { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -486,16 +265,6 @@ define i1 @length16_eq_const(i8* %X) nounwind minsize { ; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 define i32 @length24(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length24: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: 
retl -; ; X64-LABEL: length24: ; X64: # %bb.0: ; X64-NEXT: pushq $24 @@ -506,18 +275,6 @@ define i32 @length24(i8* %X, i8* %Y) nounwind minsize { } define i1 @length24_eq(i8* %x, i8* %y) nounwind minsize { -; X86-LABEL: length24_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length24_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -534,18 +291,6 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind minsize { } define i1 @length24_eq_const(i8* %X) nounwind minsize { -; X86-LABEL: length24_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length24_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -563,16 +308,6 @@ define i1 @length24_eq_const(i8* %X) nounwind minsize { } define i32 @length32(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length32: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length32: ; X64: # %bb.0: ; X64-NEXT: pushq $32 @@ -585,18 +320,6 @@ define i32 @length32(i8* %X, i8* %Y) nounwind minsize { ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { -; X86-LABEL: length32_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq: ; 
X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rax @@ -631,18 +354,6 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize { } define i1 @length32_eq_const(i8* %X) nounwind minsize { -; X86-LABEL: length32_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rax @@ -678,16 +389,6 @@ define i1 @length32_eq_const(i8* %X) nounwind minsize { } define i32 @length64(i8* %X, i8* %Y) nounwind minsize { -; X86-LABEL: length64: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length64: ; X64: # %bb.0: ; X64-NEXT: pushq $64 @@ -698,18 +399,6 @@ define i32 @length64(i8* %X, i8* %Y) nounwind minsize { } define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { -; X86-LABEL: length64_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length64_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -726,18 +415,6 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize { } define i1 @length64_eq_const(i8* %X) nounwind minsize { -; X86-LABEL: length64_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length64_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax diff --git 
a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll new file mode 100644 index 0000000000000..edfaaaed7d849 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -0,0 +1,2916 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way. +; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE +; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86,X86-SSE1 +; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 +; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1 + +declare dso_local i32 @memcmp(i8*, i8*, i32) + +define i32 @length0(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length0: +; X86: # %bb.0: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind + ret i32 %m + } + +define i1 
@length0_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length0_eq: +; X86: # %bb.0: +; X86-NEXT: movb $1, %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length0_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length0_lt: +; X86: # %bb.0: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i32 @length2(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_lt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i1 @length2_gt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_gt: +; X86: # 
%bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %cx, %ecx +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setg %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length3: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB9_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB9_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal 
-1(%eax,%eax), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4_lt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl 
+ %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i1 @length4_gt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4_gt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: seta %dl +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setg %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length5: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB16_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB16_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length5_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length5_lt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB18_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB18_2 +; X86-NEXT: .LBB18_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB18_2: # %endblock +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i32 @length7(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length7: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB19_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 3(%esi), %ecx +; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB19_3 +; X86-NEXT: .LBB19_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal 
-1(%eax,%eax), %eax +; X86-NEXT: .LBB19_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind + ret i32 %m +} + +define i1 @length7_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length7_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 3(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 3(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length7_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length7_lt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB21_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 3(%esi), %ecx +; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB21_3 +; X86-NEXT: .LBB21_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB21_3: # %endblock +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; 
X86-NEXT: jne .LBB22_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB22_3 +; X86-NEXT: .LBB22_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB22_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length9_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length9_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movb 8(%ecx), %cl +; X86-NEXT: xorb 8(%eax), %cl +; 
X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: sete %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length10_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length10_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movzwl 8(%ecx), %ecx +; X86-NEXT: xorw 8(%eax), %cx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: sete %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 10) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length11_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length11_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 7(%ecx), %ecx +; X86-NEXT: xorl 7(%eax), %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: sete %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 11) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %ecx +; X86-NEXT: xorl 8(%eax), %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: setne %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = 
tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB29_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB29_3 +; X86-NEXT: # %bb.2: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB29_4 +; X86-NEXT: .LBB29_3: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB29_4: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + ret i32 %m +} + +define i1 @length13_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length13_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movb 12(%ecx), %cl +; X86-NEXT: xorb 12(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: sete %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 13) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length14_eq(i8* %X, i8* %Y) nounwind { +; 
X86-LABEL: length14_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movzwl 12(%ecx), %ecx +; X86-NEXT: xorw 12(%eax), %cx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: sete %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 14) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length15_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_eq: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %esi +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl 8(%ecx), %edx +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl 11(%ecx), %ecx +; X86-NEXT: xorl 11(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: sete %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 15) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB33_4 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; 
X86-NEXT: jne .LBB33_4 +; X86-NEXT: # %bb.2: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB33_4 +; X86-NEXT: # %bb.3: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB33_5 +; X86-NEXT: .LBB33_4: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB33_5: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movl (%ecx), %edx +; X86-NOSSE-NEXT: movl 4(%ecx), %esi +; X86-NOSSE-NEXT: xorl (%eax), %edx +; X86-NOSSE-NEXT: xorl 4(%eax), %esi +; X86-NOSSE-NEXT: orl %edx, %esi +; X86-NOSSE-NEXT: movl 8(%ecx), %edx +; X86-NOSSE-NEXT: xorl 8(%eax), %edx +; X86-NOSSE-NEXT: movl 12(%ecx), %ecx +; X86-NOSSE-NEXT: xorl 12(%eax), %ecx +; X86-NOSSE-NEXT: orl %edx, %ecx +; X86-NOSSE-NEXT: orl %esi, %ecx +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length16_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl (%ecx), %edx +; X86-SSE1-NEXT: movl 4(%ecx), %esi +; X86-SSE1-NEXT: xorl (%eax), %edx +; X86-SSE1-NEXT: xorl 4(%eax), %esi +; X86-SSE1-NEXT: orl %edx, %esi +; X86-SSE1-NEXT: movl 8(%ecx), %edx +; X86-SSE1-NEXT: xorl 8(%eax), %edx +; X86-SSE1-NEXT: movl 12(%ecx), %ecx +; X86-SSE1-NEXT: xorl 12(%eax), %ecx +; 
X86-SSE1-NEXT: orl %edx, %ecx +; X86-SSE1-NEXT: orl %esi, %ecx +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length16_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length16_lt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB35_4 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB35_4 +; X86-NEXT: # %bb.2: # %loadbb2 +; X86-NEXT: movl 8(%esi), %ecx +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB35_4 +; X86-NEXT: # %bb.3: # %loadbb3 +; X86-NEXT: movl 12(%esi), %ecx +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, 
%ecx +; X86-NEXT: je .LBB35_5 +; X86-NEXT: .LBB35_4: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB35_5: # %endblock +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length16_gt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB36_4 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB36_4 +; X86-NEXT: # %bb.2: # %loadbb2 +; X86-NEXT: movl 8(%esi), %eax +; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB36_4 +; X86-NEXT: # %bb.3: # %loadbb3 +; X86-NEXT: movl 12(%esi), %eax +; X86-NEXT: movl 12(%edx), %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB36_5 +; X86-NEXT: .LBB36_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %edx +; X86-NEXT: .LBB36_5: # %endblock +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setg %al +; X86-NEXT: popl %esi +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %esi 
+; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NOSSE-NEXT: xorl (%eax), %ecx +; X86-NOSSE-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NOSSE-NEXT: xorl 4(%eax), %edx +; X86-NOSSE-NEXT: orl %ecx, %edx +; X86-NOSSE-NEXT: movl $825243960, %ecx # imm = 0x31303938 +; X86-NOSSE-NEXT: xorl 8(%eax), %ecx +; X86-NOSSE-NEXT: movl $892613426, %esi # imm = 0x35343332 +; X86-NOSSE-NEXT: xorl 12(%eax), %esi +; X86-NOSSE-NEXT: orl %ecx, %esi +; X86-NOSSE-NEXT: orl %edx, %esi +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: popl %esi +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length16_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-SSE1-NEXT: xorl (%eax), %ecx +; X86-SSE1-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-SSE1-NEXT: xorl 4(%eax), %edx +; X86-SSE1-NEXT: orl %ecx, %edx +; X86-SSE1-NEXT: movl $825243960, %ecx # imm = 0x31303938 +; X86-SSE1-NEXT: xorl 8(%eax), %ecx +; X86-SSE1-NEXT: movl $892613426, %esi # imm = 0x35343332 +; X86-SSE1-NEXT: xorl 12(%eax), %esi +; X86-SSE1-NEXT: orl %ecx, %esi +; X86-SSE1-NEXT: orl %edx, %esi +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length16_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* 
%X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length24_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $24 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length24_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: 
movdqu 8(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length24_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length24_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length24_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $24 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; 
X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length24_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length31(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length31: +; X86: # %bb.0: +; X86-NEXT: pushl $31 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 31) nounwind + ret i32 %m +} + +define i1 @length31_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length31_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $31 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length31_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $31 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; 
X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length31_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length31_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length31_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $31 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length31_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $31 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl 
$12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { +; X86-NOSSE-LABEL: length31_eq_prefer128: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $31 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length31_eq_prefer128: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $31 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length31_eq_prefer128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length31_eq_prefer128: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; 
X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length31_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $31 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length31_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $31 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length31_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length31_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 31) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind { +; X86-LABEL: 
length32: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length32_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por 
%xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length32_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length32_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { +; X86-NOSSE-LABEL: length32_eq_prefer128: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq_prefer128: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_prefer128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length32_eq_prefer128: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length32_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length48(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length48: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 48) nounwind + ret i32 %m +} + +define i1 @length48_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length48_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $48 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length48_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $48 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length48_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax 
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pmovmskb %xmm3, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length48_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE41-NEXT: pxor %xmm1, %xmm3 +; X86-SSE41-NEXT: por %xmm0, %xmm3 +; X86-SSE41-NEXT: por %xmm2, %xmm3 +; X86-SSE41-NEXT: ptest %xmm3, %xmm3 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length48_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length48_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl 
{{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { +; X86-NOSSE-LABEL: length48_eq_prefer128: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $48 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length48_eq_prefer128: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $48 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length48_eq_prefer128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pmovmskb %xmm3, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length48_eq_prefer128: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 
16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE41-NEXT: pxor %xmm1, %xmm3 +; X86-SSE41-NEXT: por %xmm0, %xmm3 +; X86-SSE41-NEXT: por %xmm2, %xmm3 +; X86-SSE41-NEXT: ptest %xmm3, %xmm3 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length48_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $48 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length48_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $48 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length48_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length48_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE41-NEXT: por %xmm1, %xmm2 +; X86-SSE41-NEXT: por %xmm0, %xmm2 +; X86-SSE41-NEXT: ptest %xmm2, %xmm2 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 48) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length63(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length63: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 63) nounwind + ret i32 %m +} + +define i1 @length63_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length63_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $63 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length63_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $63 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length63_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 47(%eax), %xmm4 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pmovmskb %xmm4, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length63_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE41-NEXT: pxor %xmm1, %xmm3 +; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 47(%eax), %xmm4 +; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: por %xmm3, %xmm4 +; X86-SSE41-NEXT: por %xmm0, %xmm4 +; X86-SSE41-NEXT: por %xmm2, %xmm4 +; X86-SSE41-NEXT: ptest %xmm4, %xmm4 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length63_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length63_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length63_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: 
length63_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length63_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length63_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $63 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length63_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $63 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length63_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length63_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; 
X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE41-NEXT: por %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm2, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 63) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length64_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $64 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length64_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $64 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length64_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb 
%xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 48(%eax), %xmm4 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm4 +; X86-SSE2-NEXT: pand %xmm3, %xmm4 +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pmovmskb %xmm4, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length64_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 +; X86-SSE41-NEXT: pxor %xmm1, %xmm3 +; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 48(%eax), %xmm4 +; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: por %xmm3, %xmm4 +; X86-SSE41-NEXT: por %xmm0, %xmm4 +; X86-SSE41-NEXT: por %xmm2, %xmm4 +; X86-SSE41-NEXT: ptest %xmm4, %xmm4 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length64_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 
@length64_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length64_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length64_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $64 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length64_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $64 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length64_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length64_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), 
%xmm0 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE41-NEXT: por %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: por %xmm2, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length96(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length96: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 96) nounwind + ret i32 %m +} + +define i1 @length96_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length96_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length96_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length96_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length96_gt(i8* %x, i8* %y) nounwind { 
+; X86-LABEL: length96_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length96_eq_const(i8* %X) nounwind { +; X86-LABEL: length96_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 96) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length127(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length127: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 127) nounwind + ret i32 %m +} + +define i1 @length127_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length127_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length127_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length127_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 
@memcmp(i8* %x, i8* %y, i32 127) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length127_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length127_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length127_eq_const(i8* %X) nounwind { +; X86-LABEL: length127_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 127) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length128(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length128: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 128) nounwind + ret i32 %m +} + +define i1 @length128_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length128_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length128_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length128_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll 
memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length128_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length128_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length128_eq_const(i8* %X) nounwind { +; X86-LABEL: length128_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 128) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length192(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length192: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 192) nounwind + ret i32 %m +} + +define i1 @length192_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length192_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length192_lt(i8* %x, i8* %y) nounwind { +; 
X86-LABEL: length192_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length192_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length192_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length192_eq_const(i8* %X) nounwind { +; X86-LABEL: length192_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 192) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length255(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length255: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 255) nounwind + ret i32 %m +} + +define i1 @length255_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length255_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail 
call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length255_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length255_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length255_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length255_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length255_eq_const(i8* %X) nounwind { +; X86-LABEL: length255_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 255) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length256(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length256: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 256) nounwind + ret i32 %m +} + +define i1 @length256_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length256_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl 
{{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length256_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length256_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length256_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length256_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length256_eq_const(i8* %X) nounwind { +; X86-LABEL: length256_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 256) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length384(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length384: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail 
call i32 @memcmp(i8* %X, i8* %Y, i32 384) nounwind + ret i32 %m +} + +define i1 @length384_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length384_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length384_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length384_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length384_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length384_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length384_eq_const(i8* %X) nounwind { +; X86-LABEL: length384_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 384) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length511(i8* %X, i8* %Y) nounwind { +; X86-LABEL: 
length511: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 511) nounwind + ret i32 %m +} + +define i1 @length511_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length511_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length511_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length511_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length511_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length511_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length511_eq_const(i8* %X) nounwind { +; X86-LABEL: length511_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail 
call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 511) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length512(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length512: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 512) nounwind + ret i32 %m +} + +define i1 @length512_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length512_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length512_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length512_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length512_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length512_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length512_eq_const(i8* %X) nounwind { +; X86-LABEL: length512_eq_const: +; X86: # %bb.0: +; X86-NEXT: 
pushl $512 # imm = 0x200 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 512) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; This checks that we do not do stupid things with huge sizes. +define i32 @huge_length(i8* %X, i8* %Y) nounwind { +; X86-LABEL: huge_length: +; X86: # %bb.0: +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9223372036854775807) nounwind + ret i32 %m +} + +define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: huge_length_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9223372036854775807) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; This checks non-constant sizes. 
+define i32 @nonconst_length(i8* %X, i8* %Y, i32 %size) nounwind { +; X86-LABEL: nonconst_length: +; X86: # %bb.0: +; X86-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 %size) nounwind + ret i32 %m +} + +define i1 @nonconst_length_eq(i8* %X, i8* %Y, i32 %size) nounwind { +; X86-LABEL: nonconst_length_eq: +; X86: # %bb.0: +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 %size) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index 84b454776c325..9fc03d421bea4 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -1,9 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way. 
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE -; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86,X86-SSE1 -; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 -; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41 ; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2 ; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41 ; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 @@ -23,11 +19,6 @@ declare dso_local i32 @memcmp(i8*, i8*, i64) define i32 @length0(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length0: -; X86: # %bb.0: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: retl -; ; X64-LABEL: length0: ; X64: # %bb.0: ; X64-NEXT: xorl %eax, %eax @@ -37,11 +28,6 @@ define i32 @length0(i8* %X, i8* %Y) nounwind { } define i1 @length0_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length0_eq: -; X86: # %bb.0: -; X86-NEXT: movb $1, %al -; X86-NEXT: retl -; ; X64-LABEL: length0_eq: ; X64: # %bb.0: ; X64-NEXT: movb $1, %al @@ -52,11 +38,6 @@ define i1 @length0_eq(i8* %X, i8* %Y) nounwind { } define i1 @length0_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length0_lt: -; X86: # %bb.0: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: retl -; ; X64-LABEL: length0_lt: ; X64: # %bb.0: ; X64-NEXT: xorl %eax, %eax @@ -67,19 +48,6 @@ define i1 @length0_lt(i8* %X, 
i8* %Y) nounwind { } define i32 @length2(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; ; X64-LABEL: length2: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -95,15 +63,6 @@ define i32 @length2(i8* %X, i8* %Y) nounwind { } define i1 @length2_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -116,21 +75,6 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind { } define i1 @length2_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_lt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length2_lt: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -149,21 +93,6 @@ define i1 @length2_lt(i8* %X, i8* %Y) nounwind { } define i1 @length2_gt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_gt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %ax -; X86-NEXT: movzwl %cx, %ecx -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: 
testl %ecx, %ecx -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length2_gt: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -182,14 +111,6 @@ define i1 @length2_gt(i8* %X, i8* %Y) nounwind { } define i1 @length2_eq_const(i8* %X) nounwind { -; X86-LABEL: length2_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -202,18 +123,6 @@ define i1 @length2_eq_const(i8* %X) nounwind { } define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_eq_nobuiltin_attr: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_nobuiltin_attr: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -229,30 +138,6 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { } define i32 @length3(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length3: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: rolw $8, %dx -; X86-NEXT: rolw $8, %si -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB9_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB9_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -276,19 +161,6 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { } define i1 
@length3_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length3_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orw %dx, %ax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -305,20 +177,6 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { } define i32 @length4(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: retl -; ; X64-LABEL: length4: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -335,15 +193,6 @@ define i32 @length4(i8* %X, i8* %Y) nounwind { } define i1 @length4_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -356,22 +205,6 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind { } define i1 @length4_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4_lt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; 
X64-LABEL: length4_lt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -391,22 +224,6 @@ define i1 @length4_lt(i8* %X, i8* %Y) nounwind { } define i1 @length4_gt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4_gt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: seta %dl -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: testl %edx, %edx -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length4_gt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -426,13 +243,6 @@ define i1 @length4_gt(i8* %X, i8* %Y) nounwind { } define i1 @length4_eq_const(i8* %X) nounwind { -; X86-LABEL: length4_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 @@ -444,30 +254,6 @@ define i1 @length4_eq_const(i8* %X) nounwind { } define i32 @length5(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length5: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB16_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB16_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length5: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -491,19 +277,6 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { } 
define i1 @length5_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length5_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -520,32 +293,6 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { } define i1 @length5_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length5_lt: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB18_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB18_2 -; X86-NEXT: .LBB18_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB18_2: # %endblock -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length5_lt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -574,34 +321,6 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { } define i32 @length7(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length7: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB19_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx 
-; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB19_3 -; X86-NEXT: .LBB19_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB19_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length7: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -630,18 +349,6 @@ define i32 @length7(i8* %X, i8* %Y) nounwind { } define i1 @length7_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length7_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 3(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 3(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length7_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -657,36 +364,6 @@ define i1 @length7_eq(i8* %X, i8* %Y) nounwind { } define i1 @length7_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length7_lt: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB21_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB21_3 -; X86-NEXT: .LBB21_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB21_3: # %endblock -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length7_lt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -718,34 +395,6 @@ define i1 @length7_lt(i8* 
%X, i8* %Y) nounwind { } define i32 @length8(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length8: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB22_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB22_3 -; X86-NEXT: .LBB22_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB22_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length8: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -762,18 +411,6 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { } define i1 @length8_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length8_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -786,17 +423,6 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { } define i1 @length8_eq_const(i8* %X) nounwind { -; X86-LABEL: length8_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-NEXT: xorl (%eax), %ecx -; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NEXT: xorl 4(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 
0x3736353433323130 @@ -809,24 +435,6 @@ define i1 @length8_eq_const(i8* %X) nounwind { } define i1 @length9_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length9_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movb 8(%ecx), %cl -; X86-NEXT: xorb 8(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length9_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -843,24 +451,6 @@ define i1 @length9_eq(i8* %X, i8* %Y) nounwind { } define i1 @length10_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length10_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movzwl 8(%ecx), %ecx -; X86-NEXT: xorw 8(%eax), %cx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length10_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -877,23 +467,6 @@ define i1 @length10_eq(i8* %X, i8* %Y) nounwind { } define i1 @length11_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length11_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 7(%ecx), %ecx -; X86-NEXT: xorl 7(%eax), %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length11_eq: ; 
X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -909,23 +482,6 @@ define i1 @length11_eq(i8* %X, i8* %Y) nounwind { } define i1 @length12_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length12_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %ecx -; X86-NEXT: xorl 8(%eax), %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length12_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -941,41 +497,6 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { } define i32 @length12(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length12: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB29_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB29_3 -; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB29_4 -; X86-NEXT: .LBB29_3: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB29_4: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1004,27 +525,6 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { } define i1 @length13_eq(i8* %X, i8* %Y) nounwind { -; 
X86-LABEL: length13_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %edx -; X86-NEXT: xorl 8(%eax), %edx -; X86-NEXT: movb 12(%ecx), %cl -; X86-NEXT: xorb 12(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length13_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1040,27 +540,6 @@ define i1 @length13_eq(i8* %X, i8* %Y) nounwind { } define i1 @length14_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length14_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %edx -; X86-NEXT: xorl 8(%eax), %edx -; X86-NEXT: movzwl 12(%ecx), %ecx -; X86-NEXT: xorw 12(%eax), %cx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length14_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1076,26 +555,6 @@ define i1 @length14_eq(i8* %X, i8* %Y) nounwind { } define i1 @length15_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length15_eq: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %esi -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 8(%ecx), %edx -; X86-NEXT: xorl 8(%eax), %edx -; X86-NEXT: movl 11(%ecx), %ecx -; X86-NEXT: xorl 11(%eax), %ecx -; 
X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: sete %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length15_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1113,48 +572,6 @@ define i1 @length15_eq(i8* %X, i8* %Y) nounwind { ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 define i32 @length16(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length16: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB33_4 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB33_4 -; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB33_4 -; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB33_5 -; X86-NEXT: .LBB33_4: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB33_5: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1183,69 +600,6 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl (%ecx), %edx -; X86-NOSSE-NEXT: movl 4(%ecx), %esi -; 
X86-NOSSE-NEXT: xorl (%eax), %edx -; X86-NOSSE-NEXT: xorl 4(%eax), %esi -; X86-NOSSE-NEXT: orl %edx, %esi -; X86-NOSSE-NEXT: movl 8(%ecx), %edx -; X86-NOSSE-NEXT: xorl 8(%eax), %edx -; X86-NOSSE-NEXT: movl 12(%ecx), %ecx -; X86-NOSSE-NEXT: xorl 12(%eax), %ecx -; X86-NOSSE-NEXT: orl %edx, %ecx -; X86-NOSSE-NEXT: orl %esi, %ecx -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length16_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl %esi -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE1-NEXT: movl (%ecx), %edx -; X86-SSE1-NEXT: movl 4(%ecx), %esi -; X86-SSE1-NEXT: xorl (%eax), %edx -; X86-SSE1-NEXT: xorl 4(%eax), %esi -; X86-SSE1-NEXT: orl %edx, %esi -; X86-SSE1-NEXT: movl 8(%ecx), %edx -; X86-SSE1-NEXT: xorl 8(%eax), %edx -; X86-SSE1-NEXT: movl 12(%ecx), %ecx -; X86-SSE1-NEXT: xorl 12(%eax), %ecx -; X86-SSE1-NEXT: orl %edx, %ecx -; X86-SSE1-NEXT: orl %esi, %ecx -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length16_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu (%eax), %xmm1 -; X86-SSE41-NEXT: pxor %xmm0, %xmm1 -; X86-SSE41-NEXT: ptest %xmm1, %xmm1 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1288,50 +642,6 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind { } 
define i1 @length16_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length16_lt: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB35_4 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB35_4 -; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB35_4 -; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB35_5 -; X86-NEXT: .LBB35_4: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB35_5: # %endblock -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length16_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1363,50 +673,6 @@ define i1 @length16_lt(i8* %x, i8* %y) nounwind { } define i1 @length16_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length16_gt: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: jne .LBB36_4 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: cmpl 
%ecx, %eax -; X86-NEXT: jne .LBB36_4 -; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl 8(%edx), %ecx -; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: jne .LBB36_4 -; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl 12(%edx), %ecx -; X86-NEXT: bswapl %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: je .LBB36_5 -; X86-NEXT: .LBB36_4: # %res_block -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: setae %dl -; X86-NEXT: leal -1(%edx,%edx), %edx -; X86-NEXT: .LBB36_5: # %endblock -; X86-NEXT: testl %edx, %edx -; X86-NEXT: setg %al -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length16_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1438,63 +704,6 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-NOSSE-NEXT: xorl (%eax), %ecx -; X86-NOSSE-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NOSSE-NEXT: xorl 4(%eax), %edx -; X86-NOSSE-NEXT: orl %ecx, %edx -; X86-NOSSE-NEXT: movl $825243960, %ecx # imm = 0x31303938 -; X86-NOSSE-NEXT: xorl 8(%eax), %ecx -; X86-NOSSE-NEXT: movl $892613426, %esi # imm = 0x35343332 -; X86-NOSSE-NEXT: xorl 12(%eax), %esi -; X86-NOSSE-NEXT: orl %ecx, %esi -; X86-NOSSE-NEXT: orl %edx, %esi -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: popl %esi -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length16_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl %esi -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-SSE1-NEXT: xorl (%eax), %ecx -; X86-SSE1-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-SSE1-NEXT: xorl 4(%eax), %edx -; 
X86-SSE1-NEXT: orl %ecx, %edx -; X86-SSE1-NEXT: movl $825243960, %ecx # imm = 0x31303938 -; X86-SSE1-NEXT: xorl 8(%eax), %ecx -; X86-SSE1-NEXT: movl $892613426, %esi # imm = 0x35343332 -; X86-SSE1-NEXT: xorl 12(%eax), %esi -; X86-SSE1-NEXT: orl %ecx, %esi -; X86-SSE1-NEXT: orl %edx, %esi -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: popl %esi -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length16_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1537,16 +746,6 @@ define i1 @length16_eq_const(i8* %X) nounwind { ; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 define i32 @length24(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length24: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length24: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1582,61 +781,6 @@ define i32 @length24(i8* %X, i8* %Y) nounwind { } define i1 @length24_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl 
%eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length24_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $24 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length24_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1694,18 +838,6 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { } define i1 @length24_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length24_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; 
X86-NEXT: retl -; ; X64-LABEL: length24_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1744,18 +876,6 @@ define i1 @length24_lt(i8* %x, i8* %y) nounwind { } define i1 @length24_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length24_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length24_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1794,55 +914,6 @@ define i1 @length24_gt(i8* %x, i8* %y) nounwind { } define i1 @length24_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length24_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $24 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length24_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: 
movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1895,16 +966,6 @@ define i1 @length24_eq_const(i8* %X) nounwind { } define i32 @length31(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length31: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $31 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length31: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1947,61 +1008,6 @@ define i32 @length31(i8* %X, i8* %Y) nounwind { } define i1 @length31_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length31_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $31 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length31_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $31 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length31_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 -; 
X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length31_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length31_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2058,18 +1064,6 @@ define i1 @length31_eq(i8* %x, i8* %y) nounwind { } define i1 @length31_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length31_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $31 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length31_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -2115,18 +1109,6 @@ define i1 @length31_lt(i8* %x, i8* %y) nounwind { } define i1 @length31_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length31_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $31 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length31_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -2172,61 +1154,6 @@ define i1 @length31_gt(i8* %x, i8* %y) nounwind { } define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { -; 
X86-NOSSE-LABEL: length31_eq_prefer128: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $31 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length31_eq_prefer128: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $31 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length31_eq_prefer128: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length31_eq_prefer128: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length31_eq_prefer128: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2283,55 +1210,6 @@ define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind 
"prefer-vector-width"= } define i1 @length31_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length31_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $31 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length31_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $31 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length31_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length31_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length31_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2384,16 +1262,6 @@ define i1 @length31_eq_const(i8* %X) nounwind { } define i32 @length32(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length32: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: 
pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length32: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -2438,61 +1306,6 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length32_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length32_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $32 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length32_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; 
X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2562,18 +1375,6 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { } define i1 @length32_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length32_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length32_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -2619,18 +1420,6 @@ define i1 @length32_lt(i8* %x, i8* %y) nounwind { } define i1 @length32_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length32_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length32_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -2676,61 +1465,6 @@ define i1 @length32_gt(i8* %x, i8* %y) nounwind { } define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { -; X86-NOSSE-LABEL: length32_eq_prefer128: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length32_eq_prefer128: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $32 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: 
calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_prefer128: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length32_eq_prefer128: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_prefer128: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2787,55 +1521,6 @@ define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"= } define i1 @length32_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length32_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length32_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $32 -; X86-SSE1-NEXT: pushl $.L.str -; 
X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length32_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2901,16 +1586,6 @@ define i1 @length32_eq_const(i8* %X) nounwind { } define i32 @length48(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length48: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length48: ; X64: # %bb.0: ; X64-NEXT: movl $48, %edx @@ -2920,69 +1595,6 @@ define i32 @length48(i8* %X, i8* %Y) nounwind { } define i1 @length48_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length48_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $48 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl 
$16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length48_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $48 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length48_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: pmovmskb %xmm3, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length48_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: por %xmm0, %xmm3 -; X86-SSE41-NEXT: por %xmm2, %xmm3 -; X86-SSE41-NEXT: ptest %xmm3, %xmm3 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length48_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -3075,18 +1687,6 @@ define i1 @length48_eq(i8* %x, i8* %y) nounwind 
{ } define i1 @length48_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length48_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length48_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3102,18 +1702,6 @@ define i1 @length48_lt(i8* %x, i8* %y) nounwind { } define i1 @length48_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length48_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length48_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3129,69 +1717,6 @@ define i1 @length48_gt(i8* %x, i8* %y) nounwind { } define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { -; X86-NOSSE-LABEL: length48_eq_prefer128: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $48 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length48_eq_prefer128: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $48 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length48_eq_prefer128: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; 
X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: pmovmskb %xmm3, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length48_eq_prefer128: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: por %xmm0, %xmm3 -; X86-SSE41-NEXT: por %xmm2, %xmm3 -; X86-SSE41-NEXT: ptest %xmm3, %xmm3 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length48_eq_prefer128: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -3263,61 +1788,6 @@ define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"= } define i1 @length48_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length48_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $48 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length48_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $48 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length48_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pmovmskb %xmm2, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length48_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE41-NEXT: por %xmm1, %xmm2 -; X86-SSE41-NEXT: por %xmm0, %xmm2 -; X86-SSE41-NEXT: ptest %xmm2, %xmm2 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length48_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -3401,16 +1871,6 @@ define i1 @length48_eq_const(i8* %X) nounwind { } define i32 @length63(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length63: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length63: ; X64: # %bb.0: ; X64-NEXT: movl $63, %edx @@ -3420,77 +1880,6 @@ define i32 @length63(i8* %X, i8* %Y) nounwind { } define i1 @length63_eq(i8* %x, i8* %y) nounwind { -; 
X86-NOSSE-LABEL: length63_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $63 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length63_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $63 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length63_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 47(%eax), %xmm4 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pmovmskb %xmm4, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length63_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 32(%ecx), 
%xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 47(%eax), %xmm4 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: por %xmm3, %xmm4 -; X86-SSE41-NEXT: por %xmm0, %xmm4 -; X86-SSE41-NEXT: por %xmm2, %xmm4 -; X86-SSE41-NEXT: ptest %xmm4, %xmm4 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length63_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -3588,18 +1977,6 @@ define i1 @length63_eq(i8* %x, i8* %y) nounwind { } define i1 @length63_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length63_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length63_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3615,18 +1992,6 @@ define i1 @length63_lt(i8* %x, i8* %y) nounwind { } define i1 @length63_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length63_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length63_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3642,67 +2007,6 @@ define i1 @length63_gt(i8* %x, i8* %y) nounwind { } define i1 @length63_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length63_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $63 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length63_eq_const: -; 
X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $63 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length63_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pand %xmm3, %xmm2 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length63_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE41-NEXT: por %xmm3, %xmm2 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: por %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length63_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -3792,16 +2096,6 @@ define i1 @length63_eq_const(i8* %X) nounwind { } define i32 @length64(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length64: -; X86: 
# %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length64: ; X64: # %bb.0: ; X64-NEXT: movl $64, %edx @@ -3811,77 +2105,6 @@ define i32 @length64(i8* %X, i8* %Y) nounwind { } define i1 @length64_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length64_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $64 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length64_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $64 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length64_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 48(%eax), %xmm4 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pmovmskb %xmm4, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length64_eq: 
-; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 48(%eax), %xmm4 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: por %xmm3, %xmm4 -; X86-SSE41-NEXT: por %xmm0, %xmm4 -; X86-SSE41-NEXT: por %xmm2, %xmm4 -; X86-SSE41-NEXT: ptest %xmm4, %xmm4 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length64_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -3994,18 +2217,6 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { } define i1 @length64_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length64_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length64_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4021,18 +2232,6 @@ define i1 @length64_lt(i8* %x, i8* %y) nounwind { } define i1 @length64_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length64_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length64_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4048,67 +2247,6 @@ define i1 @length64_gt(i8* %x, i8* %y) nounwind { } define i1 @length64_eq_const(i8* %X) nounwind { -; 
X86-NOSSE-LABEL: length64_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $64 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length64_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $64 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length64_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pand %xmm3, %xmm2 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length64_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 -; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE41-NEXT: por %xmm3, %xmm2 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: por %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length64_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -4213,16 +2351,6 @@ define i1 @length64_eq_const(i8* %X) nounwind { } define i32 @length96(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length96: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length96: ; X64: # %bb.0: ; X64-NEXT: movl $96, %edx @@ -4232,18 +2360,6 @@ define i32 @length96(i8* %X, i8* %Y) nounwind { } define i1 @length96_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length96_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length96_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4342,18 +2458,6 @@ define i1 @length96_eq(i8* %x, i8* %y) nounwind { } define i1 @length96_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length96_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length96_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4369,18 +2473,6 @@ define i1 @length96_lt(i8* %x, i8* %y) nounwind { } define i1 @length96_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length96_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: 
addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length96_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4396,18 +2488,6 @@ define i1 @length96_gt(i8* %x, i8* %y) nounwind { } define i1 @length96_eq_const(i8* %X) nounwind { -; X86-LABEL: length96_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length96_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4504,16 +2584,6 @@ define i1 @length96_eq_const(i8* %X) nounwind { } define i32 @length127(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length127: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length127: ; X64: # %bb.0: ; X64-NEXT: movl $127, %edx @@ -4523,18 +2593,6 @@ define i32 @length127(i8* %X, i8* %Y) nounwind { } define i1 @length127_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length127_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length127_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4640,18 +2698,6 @@ define i1 @length127_eq(i8* %x, i8* %y) nounwind { } define i1 @length127_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length127_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: 
retl -; ; X64-LABEL: length127_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4667,18 +2713,6 @@ define i1 @length127_lt(i8* %x, i8* %y) nounwind { } define i1 @length127_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length127_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length127_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4694,18 +2728,6 @@ define i1 @length127_gt(i8* %x, i8* %y) nounwind { } define i1 @length127_eq_const(i8* %X) nounwind { -; X86-LABEL: length127_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length127_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4812,16 +2834,6 @@ define i1 @length127_eq_const(i8* %X) nounwind { } define i32 @length128(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length128: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length128: ; X64: # %bb.0: ; X64-NEXT: movl $128, %edx @@ -4831,18 +2843,6 @@ define i32 @length128(i8* %X, i8* %Y) nounwind { } define i1 @length128_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length128_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length128_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4948,18 +2948,6 @@ 
define i1 @length128_eq(i8* %x, i8* %y) nounwind { } define i1 @length128_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length128_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length128_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4975,18 +2963,6 @@ define i1 @length128_lt(i8* %x, i8* %y) nounwind { } define i1 @length128_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length128_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length128_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5002,18 +2978,6 @@ define i1 @length128_gt(i8* %x, i8* %y) nounwind { } define i1 @length128_eq_const(i8* %X) nounwind { -; X86-LABEL: length128_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length128_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5120,16 +3084,6 @@ define i1 @length128_eq_const(i8* %X) nounwind { } define i32 @length192(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length192: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length192: ; X64: # %bb.0: ; X64-NEXT: movl $192, %edx @@ -5139,18 +3093,6 @@ define i32 @length192(i8* %X, i8* %Y) nounwind { } define i1 @length192_eq(i8* %x, 
i8* %y) nounwind { -; X86-LABEL: length192_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length192_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5238,18 +3180,6 @@ define i1 @length192_eq(i8* %x, i8* %y) nounwind { } define i1 @length192_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length192_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length192_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5265,18 +3195,6 @@ define i1 @length192_lt(i8* %x, i8* %y) nounwind { } define i1 @length192_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length192_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length192_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5292,18 +3210,6 @@ define i1 @length192_gt(i8* %x, i8* %y) nounwind { } define i1 @length192_eq_const(i8* %X) nounwind { -; X86-LABEL: length192_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length192_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5395,16 +3301,6 @@ define i1 @length192_eq_const(i8* %X) nounwind { } define i32 @length255(i8* %X, i8* %Y) nounwind 
{ -; X86-LABEL: length255: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length255: ; X64: # %bb.0: ; X64-NEXT: movl $255, %edx @@ -5414,18 +3310,6 @@ define i32 @length255(i8* %X, i8* %Y) nounwind { } define i1 @length255_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length255_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length255_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5522,18 +3406,6 @@ define i1 @length255_eq(i8* %x, i8* %y) nounwind { } define i1 @length255_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length255_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length255_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5549,18 +3421,6 @@ define i1 @length255_lt(i8* %x, i8* %y) nounwind { } define i1 @length255_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length255_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length255_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5576,18 +3436,6 @@ define i1 @length255_gt(i8* %x, i8* %y) nounwind { } define i1 @length255_eq_const(i8* %X) nounwind { -; X86-LABEL: length255_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; 
X86-NEXT: pushl $255 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length255_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5688,16 +3536,6 @@ define i1 @length255_eq_const(i8* %X) nounwind { } define i32 @length256(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length256: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length256: ; X64: # %bb.0: ; X64-NEXT: movl $256, %edx # imm = 0x100 @@ -5707,18 +3545,6 @@ define i32 @length256(i8* %X, i8* %Y) nounwind { } define i1 @length256_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length256_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length256_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5815,18 +3641,6 @@ define i1 @length256_eq(i8* %x, i8* %y) nounwind { } define i1 @length256_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length256_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length256_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5842,18 +3656,6 @@ define i1 @length256_lt(i8* %x, i8* %y) nounwind { } define i1 @length256_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length256_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 
# imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length256_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -5869,18 +3671,6 @@ define i1 @length256_gt(i8* %x, i8* %y) nounwind { } define i1 @length256_eq_const(i8* %X) nounwind { -; X86-LABEL: length256_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length256_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -5981,16 +3771,6 @@ define i1 @length256_eq_const(i8* %X) nounwind { } define i32 @length384(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length384: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length384: ; X64: # %bb.0: ; X64-NEXT: movl $384, %edx # imm = 0x180 @@ -6000,18 +3780,6 @@ define i32 @length384(i8* %X, i8* %Y) nounwind { } define i1 @length384_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length384_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length384_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6027,18 +3795,6 @@ define i1 @length384_eq(i8* %x, i8* %y) nounwind { } define i1 @length384_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length384_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length384_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6054,18 +3810,6 @@ define i1 @length384_lt(i8* %x, i8* %y) nounwind { } define i1 @length384_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length384_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length384_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6081,18 +3825,6 @@ define i1 @length384_gt(i8* %x, i8* %y) nounwind { } define i1 @length384_eq_const(i8* %X) nounwind { -; X86-LABEL: length384_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length384_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6109,16 +3841,6 @@ define i1 @length384_eq_const(i8* %X) nounwind { } define i32 @length511(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length511: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length511: ; X64: # %bb.0: ; X64-NEXT: movl $511, %edx # imm = 0x1FF @@ -6128,18 +3850,6 @@ define i32 @length511(i8* %X, i8* %Y) nounwind { } define i1 @length511_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length511_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length511_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6155,18 +3865,6 @@ define i1 @length511_eq(i8* %x, i8* %y) nounwind { } define i1 @length511_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length511_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length511_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6182,18 +3880,6 @@ define i1 @length511_lt(i8* %x, i8* %y) nounwind { } define i1 @length511_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length511_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length511_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6209,18 +3895,6 @@ define i1 @length511_gt(i8* %x, i8* %y) nounwind { } define i1 @length511_eq_const(i8* %X) nounwind { -; X86-LABEL: length511_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length511_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6237,16 +3911,6 @@ define i1 @length511_eq_const(i8* %X) nounwind { } define i32 @length512(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length512: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length512: ; X64: # %bb.0: ; X64-NEXT: movl $512, %edx # imm = 0x200 @@ -6256,18 +3920,6 @@ define i32 @length512(i8* %X, i8* %Y) nounwind { } define i1 @length512_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length512_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length512_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6283,18 +3935,6 @@ define i1 @length512_eq(i8* %x, i8* %y) nounwind { } define i1 @length512_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length512_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length512_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6310,18 +3950,6 @@ define i1 @length512_lt(i8* %x, i8* %y) nounwind { } define i1 @length512_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length512_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length512_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6337,18 +3965,6 @@ define i1 @length512_gt(i8* %x, i8* %y) nounwind { } define i1 @length512_eq_const(i8* %X) nounwind { -; X86-LABEL: length512_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl $.L.str -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length512_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6366,16 +3982,6 @@ define i1 @length512_eq_const(i8* %X) nounwind { ; This checks that we do not do stupid things with huge sizes. define i32 @huge_length(i8* %X, i8* %Y) nounwind { -; X86-LABEL: huge_length: -; X86: # %bb.0: -; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: huge_length: ; X64: # %bb.0: ; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF @@ -6385,18 +3991,6 @@ define i32 @huge_length(i8* %X, i8* %Y) nounwind { } define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: huge_length_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: huge_length_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -6413,10 +4007,6 @@ define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { ; This checks non-constant sizes. 
define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind { -; X86-LABEL: nonconst_length: -; X86: # %bb.0: -; X86-NEXT: jmp memcmp # TAILCALL -; ; X64-LABEL: nonconst_length: ; X64: # %bb.0: ; X64-NEXT: jmp memcmp # TAILCALL @@ -6425,18 +4015,6 @@ define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind { } define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind { -; X86-LABEL: nonconst_length_eq: -; X86: # %bb.0: -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: nonconst_length_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll new file mode 100644 index 0000000000000..2c45b8510d266 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -0,0 +1,584 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare dso_local i32 @memcmp(i8*, i8*, i32) +declare dso_local i32 @bcmp(i8*, i8*, i32) + +define i32 @length2(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; 
X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB4_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_2 +; X86-NEXT: .LBB4_3: # %res_block +; X86-NEXT: setae %al +; 
X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB4_2: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 4) nounwind + %c = icmp eq i32 %m, 
0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_2 +; X86-NEXT: .LBB9_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB9_2: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; 
X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* 
%X, i8* %Y, i32 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl 
$65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; 
X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; 
X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind optsize { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp ne i32 
%call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind optsize { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind optsize { +; X86-LABEL: bcmp_length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl + %m = tail call i32 @bcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index ab5cc93ca41c2..4c5b339859719 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -1,6 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 @@ -14,19 +12,6 @@ declare dso_local i32 
@memcmp(i8*, i8*, i64) declare dso_local i32 @bcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; ; X64-LABEL: length2: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -42,15 +27,6 @@ define i32 @length2(i8* %X, i8* %Y) nounwind optsize { } define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length2_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -63,14 +39,6 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize { } define i1 @length2_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length2_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -83,18 +51,6 @@ define i1 @length2_eq_const(i8* %X) nounwind optsize { } define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length2_eq_nobuiltin_attr: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_nobuiltin_attr: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -110,30 +66,6 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* 
%Y) nounwind optsize { } define i32 @length3(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length3: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: rolw $8, %dx -; X86-NEXT: rolw $8, %si -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB4_2 -; X86-NEXT: .LBB4_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB4_2: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -157,19 +89,6 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { } define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length3_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orw %dx, %ax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -186,20 +105,6 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize { } define i32 @length4(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length4: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: retl -; ; X64-LABEL: length4: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -216,15 +121,6 @@ define i32 @length4(i8* %X, i8* 
%Y) nounwind optsize { } define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length4_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -237,13 +133,6 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize { } define i1 @length4_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length4_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 @@ -255,30 +144,6 @@ define i1 @length4_eq_const(i8* %X) nounwind optsize { } define i32 @length5(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length5: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB9_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB9_2 -; X86-NEXT: .LBB9_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB9_2: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length5: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -302,19 +167,6 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { } define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length5_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: xorl (%eax), 
%edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -331,34 +183,6 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize { } define i32 @length8(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length8: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB11_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length8: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -375,18 +199,6 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { } define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length8_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -399,17 +211,6 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize { } define i1 @length8_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length8_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 
$858927408, %ecx # imm = 0x33323130 -; X86-NEXT: xorl (%eax), %ecx -; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NEXT: xorl 4(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 @@ -422,18 +223,6 @@ define i1 @length8_eq_const(i8* %X) nounwind optsize { } define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length12_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length12_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -449,16 +238,6 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize { } define i32 @length12(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length12: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -489,16 +268,6 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 define i32 @length16(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length16: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -527,30 +296,6 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { } define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; 
X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -574,28 +319,6 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize { } define i1 @length16_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -620,16 +343,6 @@ define i1 @length16_eq_const(i8* %X) nounwind optsize { ; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 define i32 @length24(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length24: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length24: ; X64: # %bb.0: ; X64-NEXT: movl $24, %edx @@ -639,34 +352,6 @@ define i32 @length24(i8* %X, i8* %Y) nounwind optsize { } define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -698,31 +383,6 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize { } define i1 @length24_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 
8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -751,16 +411,6 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize { } define i32 @length32(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length32: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length32: ; X64: # %bb.0: ; X64-NEXT: movl $32, %edx @@ -772,34 +422,6 @@ define i32 @length32(i8* %X, i8* %Y) nounwind optsize { ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { -; X86-NOSSE-LABEL: length32_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq: ; X64-SSE2: # 
%bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -837,31 +459,6 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize { } define i1 @length32_eq_const(i8* %X) nounwind optsize { -; X86-NOSSE-LABEL: length32_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -897,16 +494,6 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize { } define i32 @length64(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: length64: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length64: ; X64: # %bb.0: ; X64-NEXT: movl $64, %edx @@ -916,18 +503,6 @@ define i32 @length64(i8* %X, i8* %Y) nounwind optsize { } define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { -; X86-LABEL: length64_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE2-LABEL: 
length64_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rax @@ -967,18 +542,6 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize { } define i1 @length64_eq_const(i8* %X) nounwind optsize { -; X86-LABEL: length64_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE2-LABEL: length64_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rax @@ -1019,19 +582,6 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize { } define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind optsize { -; X86-LABEL: bcmp_length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; ; X64-LABEL: bcmp_length2: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -1045,4 +595,3 @@ define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind optsize { %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind ret i32 %m } - diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll new file mode 100644 index 0000000000000..0953e35b33979 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -0,0 +1,601 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [65 x i8] 
c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1 + +declare dso_local i32 @memcmp(i8*, i8*, i32) +declare dso_local i32 @bcmp(i8*, i8*, i32) + +define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # %bb.0: +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind !prof 
!14 { +; X86-LABEL: length3: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB4_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_2 +; X86-NEXT: .LBB4_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB4_2: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl 
(%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i32 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB9_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_2 +; X86-NEXT: .LBB9_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB9_2: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: 
pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB11_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB11_3 +; X86-NEXT: .LBB11_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB11_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl 
{{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + ret i32 %m +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length16_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind !prof !14 { 
+; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length24: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb 
%xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 32) nounwind + ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp 
+; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind !prof !14 { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; 
X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind !prof !14 { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i32 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 { +; X86-LABEL: bcmp_length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl + %m = tail call i32 @bcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i32 10000} +!4 = !{!"MaxCount", i32 10} +!5 = !{!"MaxInternalCount", i32 1} +!6 = !{!"MaxFunctionCount", i32 1000} +!7 = !{!"NumCounts", i32 3} +!8 = !{!"NumFunctions", i32 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, 
!13} +!11 = !{i32 10000, i32 100, i32 1} +!12 = !{i32 999000, i32 100, i32 1} +!13 = !{i32 999999, i32 1, i32 2} +!14 = !{!"function_entry_count", i32 0} diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index b39f0d70a56b9..f763d91b8f774 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -1,6 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 @@ -14,19 +12,6 @@ declare dso_local i32 @memcmp(i8*, i8*, i64) declare dso_local i32 @bcmp(i8*, i8*, i64) define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; ; X64-LABEL: length2: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -42,15 +27,6 @@ define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length2_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx -; 
X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -63,14 +39,6 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { -; X86-LABEL: length2_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -83,18 +51,6 @@ define i1 @length2_eq_const(i8* %X) nounwind !prof !14 { } define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length2_eq_nobuiltin_attr: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_nobuiltin_attr: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -110,30 +66,6 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 { } define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length3: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: rolw $8, %dx -; X86-NEXT: rolw $8, %si -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB4_2 -; X86-NEXT: .LBB4_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB4_2: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -157,19 +89,6 @@ define 
i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length3_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orw %dx, %ax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -186,20 +105,6 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 { } define i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length4: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: retl -; ; X64-LABEL: length4: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -216,15 +121,6 @@ define i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length4_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -237,13 +133,6 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length4_eq_const(i8* %X) nounwind !prof !14 { -; X86-LABEL: length4_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 @@ -255,30 +144,6 @@ 
define i1 @length4_eq_const(i8* %X) nounwind !prof !14 { } define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length5: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB9_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB9_2 -; X86-NEXT: .LBB9_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB9_2: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length5: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -302,19 +167,6 @@ define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length5_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -331,34 +183,6 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 { } define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length8: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; 
X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB11_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length8: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -375,18 +199,6 @@ define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length8_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -399,17 +211,6 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length8_eq_const(i8* %X) nounwind !prof !14 { -; X86-LABEL: length8_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-NEXT: xorl (%eax), %ecx -; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NEXT: xorl 4(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 @@ -422,18 +223,6 @@ define i1 @length8_eq_const(i8* %X) nounwind !prof !14 { } define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length12_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length12_eq: ; X64: # 
%bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -449,16 +238,6 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 { } define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length12: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -489,16 +268,6 @@ define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length16: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -527,30 +296,6 @@ define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -574,28 +319,6 @@ define i1 
@length16_eq(i8* %x, i8* %y) nounwind !prof !14 { } define i1 @length16_eq_const(i8* %X) nounwind !prof !14 { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -620,16 +343,6 @@ define i1 @length16_eq_const(i8* %X) nounwind !prof !14 { ; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length24: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length24: ; X64: # %bb.0: ; X64-NEXT: movl $24, %edx @@ -639,34 +352,6 @@ define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -698,31 +383,6 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 { } define i1 @length24_eq_const(i8* %X) nounwind !prof !14 { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -751,16 +411,6 @@ define i1 @length24_eq_const(i8* %X) nounwind !prof !14 { } define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length32: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; 
X64-LABEL: length32: ; X64: # %bb.0: ; X64-NEXT: movl $32, %edx @@ -772,34 +422,6 @@ define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 { ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 { -; X86-NOSSE-LABEL: length32_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -837,31 +459,6 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 { } define i1 @length32_eq_const(i8* %X) nounwind !prof !14 { -; X86-NOSSE-LABEL: length32_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, 
%xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -897,16 +494,6 @@ define i1 @length32_eq_const(i8* %X) nounwind !prof !14 { } define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: length64: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length64: ; X64: # %bb.0: ; X64-NEXT: movl $64, %edx @@ -916,18 +503,6 @@ define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 { } define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 { -; X86-LABEL: length64_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE2-LABEL: length64_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rax @@ -967,18 +542,6 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 { } define i1 @length64_eq_const(i8* %X) nounwind !prof !14 { -; X86-LABEL: length64_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE2-LABEL: length64_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rax @@ -1019,19 +582,6 @@ define i1 @length64_eq_const(i8* %X) nounwind !prof !14 { } define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 { -; X86-LABEL: bcmp_length2: -; X86: # %bb.0: -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; ; X64-LABEL: bcmp_length2: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll new file mode 100644 index 0000000000000..ad7a1c0e0a492 --- /dev/null +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -0,0 +1,2434 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86,X86-SSE1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41 + +; This tests codegen time inlining/optimization of memcmp +; rdar://6480398 + +@.str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1 + +declare dso_local i32 @memcmp(i8*, i8*, i32) + +define i32 @length0(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length0: +; X86: # %bb.0: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind + ret 
i32 %m + } + +define i1 @length0_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length0_eq: +; X86: # %bb.0: +; X86-NEXT: movb $1, %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length0_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length0_lt: +; X86: # %bb.0: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 0) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i32 @length2(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + ret i32 %m +} + +define i32 @length2_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: addl $-12594, %eax # imm = 0xCECE +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + ret i32 %m +} + +define i1 @length2_gt_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_gt_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: addl $-12594, %eax # imm = 0xCECE +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: 
length2_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length2_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_lt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %dx +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i1 @length2_gt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_gt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %ecx +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: rolw $8, %cx +; X86-NEXT: rolw $8, %ax +; X86-NEXT: movzwl %cx, %ecx +; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: setg %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_const(i8* %X) nounwind { +; X86-LABEL: length2_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 2) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length2_eq_nobuiltin_attr: +; X86: # 
%bb.0: +; X86-NEXT: pushl $2 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length3(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length3: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl (%ecx), %esi +; X86-NEXT: rolw $8, %dx +; X86-NEXT: rolw $8, %si +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: jne .LBB11_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB11_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + ret i32 %m +} + +define i1 @length3_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length3_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: xorw (%eax), %dx +; X86-NEXT: movb 2(%ecx), %cl +; X86-NEXT: xorb 2(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orw %dx, %ax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 3) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length4(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: 
sbbl $0, %eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + ret i32 %m +} + +define i1 @length4_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length4_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4_lt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: seta %al +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i1 @length4_gt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length4_gt: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %ecx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: seta %dl +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: setg %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 4) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + +define i1 @length4_eq_const(i8* %X) nounwind { +; X86-LABEL: length4_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 
1), i32 4) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length5(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length5: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB18_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB18_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + ret i32 %m +} + +define i1 @length5_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length5_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: movb 4(%ecx), %cl +; X86-NEXT: xorb 4(%eax), %cl +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length5_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length5_lt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: bswapl %edx +; X86-NEXT: bswapl %esi +; X86-NEXT: cmpl %esi, %edx +; X86-NEXT: jne .LBB20_3 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB20_2 +; X86-NEXT: .LBB20_3: # %res_block +; X86-NEXT: setae %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: leal -1(%eax,%eax), %eax +; 
X86-NEXT: .LBB20_2: # %endblock +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 5) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i32 @length7(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length7: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB21_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 3(%esi), %ecx +; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB21_3 +; X86-NEXT: .LBB21_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB21_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind + ret i32 %m +} + +define i1 @length7_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length7_lt: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB22_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 3(%esi), %ecx +; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB22_3 +; X86-NEXT: .LBB22_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB22_3: # %endblock +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def 
$al killed $al killed $eax +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i1 @length7_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length7_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 3(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 3(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 7) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length8(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length8: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esi), %ecx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: jne .LBB24_2 +; X86-NEXT: # %bb.1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: bswapl %ecx +; X86-NEXT: bswapl %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: je .LBB24_3 +; X86-NEXT: .LBB24_2: # %res_block +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setae %al +; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: .LBB24_3: # %endblock +; X86-NEXT: popl %esi +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + ret i32 %m +} + +define i1 @length8_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length8_eq: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: xorl (%eax), %edx +; X86-NEXT: xorl 4(%eax), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 8) nounwind + %c = icmp eq i32 %m, 
0 + ret i1 %c +} + +define i1 @length8_eq_const(i8* %X) nounwind { +; X86-LABEL: length8_eq_const: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 +; X86-NEXT: xorl (%eax), %ecx +; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 +; X86-NEXT: xorl 4(%eax), %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 8) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i1 @length9_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length9_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $9 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length10_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length10_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $10 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 10) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length11_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length11_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $11 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 11) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length12_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length12_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl 
{{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length12(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length12: +; X86: # %bb.0: +; X86-NEXT: pushl $12 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 12) nounwind + ret i32 %m +} + +define i1 @length13_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length13_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $13 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 13) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length14_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length14_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $14 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 14) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length15(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15: +; X86: # %bb.0: +; X86-NEXT: pushl $15 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 15) nounwind + ret i32 %m +} + +define i1 @length15_lt(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $15 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp 
+; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 15) nounwind + %c = icmp slt i32 %m, 0 + ret i1 %c +} + +define i32 @length15_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_const: +; X86: # %bb.0: +; X86-NEXT: pushl $15 +; X86-NEXT: pushl $.L.str+1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 15) nounwind + ret i32 %m +} + +define i1 @length15_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $15 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 15) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i1 @length15_gt_const(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length15_gt_const: +; X86: # %bb.0: +; X86-NEXT: pushl $15 +; X86-NEXT: pushl $.L.str+1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i32 15) nounwind + %c = icmp sgt i32 %m, 0 + ret i1 %c +} + +; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 + +define i32 @length16(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length16: +; X86: # %bb.0: +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 16) nounwind + ret i32 %m +} + +define i1 @length16_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length16_eq: 
+; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length16_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $16 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu (%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length16_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm1 +; X86-SSE41-NEXT: pxor %xmm0, %xmm1 +; X86-SSE41-NEXT: ptest %xmm1, %xmm1 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length16_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_gt(i8* %x, i8* %y) nounwind { 
+; X86-LABEL: length16_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $16 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length16_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length16_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $16 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length16_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $16 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length16_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length16_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 16) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 + +define i32 @length24(i8* %X, i8* %Y) nounwind { +; X86-LABEL: 
length24: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 24) nounwind + ret i32 %m +} + +define i1 @length24_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length24_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length24_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $24 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length24_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 8(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; 
X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length24_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length24_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $24 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 24) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length24_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length24_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $24 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length24_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $24 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length24_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: 
pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length24_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 24) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length31(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length31: +; X86: # %bb.0: +; X86-NEXT: pushl $31 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 31) nounwind + ret i32 %m +} + +define i1 @length31_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length31_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $31 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length31_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $31 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length31_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length31_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length31_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $31 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length31_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $31 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind 
"prefer-vector-width"="128" { +; X86-NOSSE-LABEL: length31_eq_prefer128: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $31 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length31_eq_prefer128: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $31 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length31_eq_prefer128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length31_eq_prefer128: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 31) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length31_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length31_eq_const: +; 
X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $31 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length31_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $31 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length31_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length31_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 31) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length32(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length32: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 32) nounwind + 
ret i32 %m +} + +; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 + +define i1 @length32_eq(i8* %x, i8* %y) nounwind { +; X86-NOSSE-LABEL: length32_eq: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length32_eq: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_lt(i8* %x, i8* %y) 
nounwind { +; X86-LABEL: length32_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length32_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $32 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { +; X86-NOSSE-LABEL: length32_eq_prefer128: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: sete %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq_prefer128: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: sete %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_prefer128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; 
X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: sete %al +; X86-SSE2-NEXT: retl +; +; X86-SSE41-LABEL: length32_eq_prefer128: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 +; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: sete %al +; X86-SSE41-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 32) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length32_eq_const(i8* %X) nounwind { +; X86-NOSSE-LABEL: length32_eq_const: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl $32 +; X86-NOSSE-NEXT: pushl $.L.str +; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: calll memcmp +; X86-NOSSE-NEXT: addl $12, %esp +; X86-NOSSE-NEXT: testl %eax, %eax +; X86-NOSSE-NEXT: setne %al +; X86-NOSSE-NEXT: retl +; +; X86-SSE1-LABEL: length32_eq_const: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl $32 +; X86-SSE1-NEXT: pushl $.L.str +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: calll memcmp +; X86-SSE1-NEXT: addl $12, %esp +; X86-SSE1-NEXT: testl %eax, %eax +; X86-SSE1-NEXT: setne %al +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: length32_eq_const: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: setne %al +; X86-SSE2-NEXT: retl +; +; 
X86-SSE41-LABEL: length32_eq_const: +; X86-SSE41: # %bb.0: +; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: ptest %xmm0, %xmm0 +; X86-SSE41-NEXT: setne %al +; X86-SSE41-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 32) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length48(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length48: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 48) nounwind + ret i32 %m +} + +define i1 @length48_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length48_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length48_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length48_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl 
{{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { +; X86-LABEL: length48_eq_prefer128: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 48) nounwind + %cmp = icmp eq i32 %call, 0 + ret i1 %cmp +} + +define i1 @length48_eq_const(i8* %X) nounwind { +; X86-LABEL: length48_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $48 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 48) nounwind + %c = icmp ne i32 %m, 0 + ret i1 %c +} + +define i32 @length63(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length63: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 63) nounwind + ret i32 %m +} + +define i1 @length63_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length63_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 
@length63_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length63_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length63_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length63_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 63) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length63_eq_const(i8* %X) nounwind { +; X86-LABEL: length63_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $63 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 63) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length64(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length64: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 64) nounwind + ret i32 %m +} + +define i1 @length64_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length64_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; 
X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length64_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length64_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 64) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length64_eq_const(i8* %X) nounwind { +; X86-LABEL: length64_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $64 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 64) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length96(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length96: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 96) nounwind + ret i32 %m +} + +define i1 @length96_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length96_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: 
pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length96_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length96_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length96_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length96_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 96) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length96_eq_const(i8* %X) nounwind { +; X86-LABEL: length96_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $96 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 96) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length127(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length127: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 127) nounwind + ret i32 %m +} + +define i1 @length127_eq(i8* 
%x, i8* %y) nounwind { +; X86-LABEL: length127_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length127_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length127_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length127_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length127_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 127) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length127_eq_const(i8* %X) nounwind { +; X86-LABEL: length127_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $127 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 127) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length128(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length128: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; 
X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 128) nounwind + ret i32 %m +} + +define i1 @length128_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length128_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length128_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length128_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length128_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length128_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 128) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length128_eq_const(i8* %X) nounwind { +; X86-LABEL: length128_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $128 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 128) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length192(i8* %X, i8* %Y) nounwind { +; X86-LABEL: 
length192: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 192) nounwind + ret i32 %m +} + +define i1 @length192_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length192_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length192_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length192_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length192_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length192_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 192) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length192_eq_const(i8* %X) nounwind { +; X86-LABEL: length192_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $192 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x 
i8]* @.str, i32 0, i32 0), i32 192) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length255(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length255: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 255) nounwind + ret i32 %m +} + +define i1 @length255_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length255_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length255_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length255_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length255_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length255_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 255) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length255_eq_const(i8* %X) nounwind { +; X86-LABEL: length255_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $255 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; 
X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 255) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length256(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length256: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 256) nounwind + ret i32 %m +} + +define i1 @length256_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length256_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length256_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length256_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length256_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length256_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 256) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length256_eq_const(i8* 
%X) nounwind { +; X86-LABEL: length256_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $256 # imm = 0x100 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 256) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length384(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length384: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 384) nounwind + ret i32 %m +} + +define i1 @length384_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length384_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length384_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length384_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length384_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length384_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp 
+; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 384) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length384_eq_const(i8* %X) nounwind { +; X86-LABEL: length384_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $384 # imm = 0x180 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 384) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length511(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length511: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 511) nounwind + ret i32 %m +} + +define i1 @length511_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length511_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length511_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length511_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length511_gt(i8* %x, i8* 
%y) nounwind { +; X86-LABEL: length511_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 511) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length511_eq_const(i8* %X) nounwind { +; X86-LABEL: length511_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $511 # imm = 0x1FF +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 511) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +define i32 @length512(i8* %X, i8* %Y) nounwind { +; X86-LABEL: length512: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 512) nounwind + ret i32 %m +} + +define i1 @length512_eq(i8* %x, i8* %y) nounwind { +; X86-LABEL: length512_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setne %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp +} + +define i1 @length512_lt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length512_lt: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: shrl $31, %eax 
+; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind + %cmp = icmp slt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length512_gt(i8* %x, i8* %y) nounwind { +; X86-LABEL: length512_gt: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: setg %al +; X86-NEXT: retl + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 512) nounwind + %cmp = icmp sgt i32 %call, 0 + ret i1 %cmp +} + +define i1 @length512_eq_const(i8* %X) nounwind { +; X86-LABEL: length512_eq_const: +; X86: # %bb.0: +; X86-NEXT: pushl $512 # imm = 0x200 +; X86-NEXT: pushl $.L.str +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 0), i32 512) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; This checks that we do not do stupid things with huge sizes. 
+define i32 @huge_length(i8* %X, i8* %Y) nounwind { +; X86-LABEL: huge_length: +; X86: # %bb.0: +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9223372036854775807) nounwind + ret i32 %m +} + +define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { +; X86-LABEL: huge_length_eq: +; X86: # %bb.0: +; X86-NEXT: pushl $-1 +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 9223372036854775807) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} + +; This checks non-constant sizes. +define i32 @nonconst_length(i8* %X, i8* %Y, i32 %size) nounwind { +; X86-LABEL: nonconst_length: +; X86: # %bb.0: +; X86-NEXT: jmp memcmp # TAILCALL + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 %size) nounwind + ret i32 %m +} + +define i1 @nonconst_length_eq(i8* %X, i8* %Y, i32 %size) nounwind { +; X86-LABEL: nonconst_length_eq: +; X86: # %bb.0: +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll memcmp +; X86-NEXT: addl $12, %esp +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sete %al +; X86-NEXT: retl + %m = tail call i32 @memcmp(i8* %X, i8* %Y, i32 %size) nounwind + %c = icmp eq i32 %m, 0 + ret i1 %c +} diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index d498d85711f80..bf11d3d18b2ed 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -1,8 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s 
--check-prefixes=X86,X86-SSE1 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 @@ -22,11 +18,6 @@ declare dso_local i32 @memcmp(i8*, i8*, i64) define i32 @length0(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length0: -; X86: # %bb.0: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: retl -; ; X64-LABEL: length0: ; X64: # %bb.0: ; X64-NEXT: xorl %eax, %eax @@ -36,11 +27,6 @@ define i32 @length0(i8* %X, i8* %Y) nounwind { } define i1 @length0_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length0_eq: -; X86: # %bb.0: -; X86-NEXT: movb $1, %al -; X86-NEXT: retl -; ; X64-LABEL: length0_eq: ; X64: # %bb.0: ; X64-NEXT: movb $1, %al @@ -51,11 +37,6 @@ define i1 @length0_eq(i8* %X, i8* %Y) nounwind { } define i1 @length0_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length0_lt: -; X86: # %bb.0: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: retl -; ; X64-LABEL: length0_lt: ; X64: # %bb.0: ; X64-NEXT: xorl %eax, %eax @@ -66,19 +47,6 @@ define i1 @length0_lt(i8* %X, i8* %Y) nounwind { } define i32 @length2(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: retl -; ; X64-LABEL: length2: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -94,15 +62,6 @@ define i32 @length2(i8* %X, i8* %Y) nounwind { } 
define i32 @length2_const(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: rolw $8, %ax -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: addl $-12594, %eax # imm = 0xCECE -; X86-NEXT: retl -; ; X64-LABEL: length2_const: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -115,17 +74,6 @@ define i32 @length2_const(i8* %X, i8* %Y) nounwind { } define i1 @length2_gt_const(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_gt_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: rolw $8, %ax -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: addl $-12594, %eax # imm = 0xCECE -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length2_gt_const: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -141,15 +89,6 @@ define i1 @length2_gt_const(i8* %X, i8* %Y) nounwind { } define i1 @length2_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -162,21 +101,6 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind { } define i1 @length2_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_lt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %dx -; X86-NEXT: movzwl %cx, %eax -; X86-NEXT: movzwl %dx, %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length2_lt: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -195,21 +119,6 @@ define i1 @length2_lt(i8* %X, i8* %Y) 
nounwind { } define i1 @length2_gt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_gt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: rolw $8, %cx -; X86-NEXT: rolw $8, %ax -; X86-NEXT: movzwl %cx, %ecx -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: subl %eax, %ecx -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length2_gt: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -228,14 +137,6 @@ define i1 @length2_gt(i8* %X, i8* %Y) nounwind { } define i1 @length2_eq_const(i8* %X) nounwind { -; X86-LABEL: length2_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -248,18 +149,6 @@ define i1 @length2_eq_const(i8* %X) nounwind { } define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length2_eq_nobuiltin_attr: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $2 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length2_eq_nobuiltin_attr: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -275,30 +164,6 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind { } define i32 @length3(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length3: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx -; X86-NEXT: movzwl (%ecx), %esi -; X86-NEXT: rolw $8, %dx -; X86-NEXT: rolw $8, %si -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB11_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: 
movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB11_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length3: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -322,19 +187,6 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { } define i1 @length3_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length3_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %edx -; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orw %dx, %ax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length3_eq: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax @@ -351,20 +203,6 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind { } define i32 @length4(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: retl -; ; X64-LABEL: length4: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -381,15 +219,6 @@ define i32 @length4(i8* %X, i8* %Y) nounwind { } define i1 @length4_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -402,22 +231,6 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind { } define i1 @length4_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4_lt: -; X86: 
# %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: seta %al -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length4_lt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -437,22 +250,6 @@ define i1 @length4_lt(i8* %X, i8* %Y) nounwind { } define i1 @length4_gt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length4_gt: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %eax, %ecx -; X86-NEXT: seta %dl -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: testl %edx, %edx -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length4_gt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -472,13 +269,6 @@ define i1 @length4_gt(i8* %X, i8* %Y) nounwind { } define i1 @length4_eq_const(i8* %X) nounwind { -; X86-LABEL: length4_eq_const: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231 -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length4_eq_const: ; X64: # %bb.0: ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231 @@ -490,30 +280,6 @@ define i1 @length4_eq_const(i8* %X) nounwind { } define i32 @length5(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length5: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB18_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax 
-; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB18_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length5: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -537,19 +303,6 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { } define i1 @length5_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length5_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length5_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -566,32 +319,6 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind { } define i1 @length5_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length5_lt: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl (%ecx), %esi -; X86-NEXT: bswapl %edx -; X86-NEXT: bswapl %esi -; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB20_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB20_2 -; X86-NEXT: .LBB20_3: # %res_block -; X86-NEXT: setae %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB20_2: # %endblock -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length5_lt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -620,34 +347,6 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { } define i32 @length7(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length7: -; X86: # %bb.0: -; 
X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB21_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB21_3 -; X86-NEXT: .LBB21_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB21_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length7: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -676,36 +375,6 @@ define i32 @length7(i8* %X, i8* %Y) nounwind { } define i1 @length7_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length7_lt: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB22_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB22_3 -; X86-NEXT: .LBB22_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB22_3: # %endblock -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length7_lt: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %ecx @@ -737,18 +406,6 @@ define i1 @length7_lt(i8* %X, i8* %Y) nounwind { } define i1 @length7_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length7_eq: 
-; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 3(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 3(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length7_eq: ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax @@ -764,34 +421,6 @@ define i1 @length7_eq(i8* %X, i8* %Y) nounwind { } define i32 @length8(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length8: -; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB24_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB24_3 -; X86-NEXT: .LBB24_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB24_3: # %endblock -; X86-NEXT: popl %esi -; X86-NEXT: retl -; ; X64-LABEL: length8: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -808,18 +437,6 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { } define i1 @length8_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length8_eq: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -832,17 +449,6 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind { } define i1 @length8_eq_const(i8* %X) nounwind { -; X86-LABEL: length8_eq_const: -; 
X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $858927408, %ecx # imm = 0x33323130 -; X86-NEXT: xorl (%eax), %ecx -; X86-NEXT: movl $926299444, %edx # imm = 0x37363534 -; X86-NEXT: xorl 4(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length8_eq_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130 @@ -855,18 +461,6 @@ define i1 @length8_eq_const(i8* %X) nounwind { } define i1 @length9_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length9_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $9 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length9_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -883,18 +477,6 @@ define i1 @length9_eq(i8* %X, i8* %Y) nounwind { } define i1 @length10_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length10_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $10 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length10_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -911,18 +493,6 @@ define i1 @length10_eq(i8* %X, i8* %Y) nounwind { } define i1 @length11_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length11_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $11 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length11_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -938,18 +508,6 @@ define i1 @length11_eq(i8* %X, i8* %Y) nounwind { } define i1 @length12_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: 
length12_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length12_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -965,16 +523,6 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind { } define i32 @length12(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length12: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $12 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length12: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1003,18 +551,6 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { } define i1 @length13_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length13_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $13 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length13_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1030,18 +566,6 @@ define i1 @length13_eq(i8* %X, i8* %Y) nounwind { } define i1 @length14_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length14_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $14 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length14_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1057,16 +581,6 @@ define i1 @length14_eq(i8* %X, i8* %Y) nounwind { } define i32 @length15(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length15: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $15 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length15: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1095,18 +609,6 @@ define i32 @length15(i8* %X, i8* %Y) nounwind { } define i1 @length15_lt(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length15_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $15 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length15_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1138,16 +640,6 @@ define i1 @length15_lt(i8* %X, i8* %Y) nounwind { } define i32 @length15_const(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length15_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $15 -; X86-NEXT: pushl $.L.str+1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length15_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3544952156018063160, %rcx # imm = 0x3132333435363738 @@ -1174,18 +666,6 @@ define i32 @length15_const(i8* %X, i8* %Y) nounwind { } define i1 @length15_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length15_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $15 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length15_eq: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1201,18 +681,6 @@ define i1 @length15_eq(i8* %X, i8* %Y) nounwind { } define i1 @length15_gt_const(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length15_gt_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $15 -; X86-NEXT: pushl $.L.str+1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: 
testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length15_gt_const: ; X64: # %bb.0: ; X64-NEXT: movabsq $3544952156018063160, %rax # imm = 0x3132333435363738 @@ -1244,16 +712,6 @@ define i1 @length15_gt_const(i8* %X, i8* %Y) nounwind { ; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329 define i32 @length16(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length16: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length16: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1282,53 +740,6 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { } define i1 @length16_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length16_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length16_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $16 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu (%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length16_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu (%eax), %xmm1 -; X86-SSE41-NEXT: pxor %xmm0, %xmm1 -; X86-SSE41-NEXT: ptest %xmm1, %xmm1 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1371,18 +782,6 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind { } define i1 @length16_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length16_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length16_lt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rcx @@ -1414,18 +813,6 @@ define i1 @length16_lt(i8* %x, i8* %y) nounwind { } define i1 @length16_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length16_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $16 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length16_gt: ; X64: # %bb.0: ; X64-NEXT: movq (%rdi), %rax @@ -1457,49 +844,6 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { } define i1 @length16_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length16_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $16 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length16_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $16 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length16_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length16_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length16_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1542,16 +886,6 @@ define i1 @length16_eq_const(i8* %X) nounwind { ; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914 define i32 @length24(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length24: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length24: ; X64: # %bb.0: ; X64-NEXT: movl $24, %edx @@ -1561,61 +895,6 @@ define i32 @length24(i8* %X, i8* %Y) nounwind { } define i1 @length24_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length24_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length24_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $24 -; X86-SSE1-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length24_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 8(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1673,18 +952,6 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind { } define i1 @length24_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length24_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length24_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -1700,18 +967,6 @@ define i1 @length24_lt(i8* %x, i8* %y) nounwind { } define i1 @length24_gt(i8* %x, i8* %y) 
nounwind { -; X86-LABEL: length24_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $24 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length24_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -1727,55 +982,6 @@ define i1 @length24_gt(i8* %x, i8* %y) nounwind { } define i1 @length24_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length24_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $24 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length24_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $24 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length24_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length24_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 
-; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length24_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1828,16 +1034,6 @@ define i1 @length24_eq_const(i8* %X) nounwind { } define i32 @length31(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length31: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $31 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length31: ; X64: # %bb.0: ; X64-NEXT: movl $31, %edx @@ -1847,61 +1043,6 @@ define i32 @length31(i8* %X, i8* %Y) nounwind { } define i1 @length31_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length31_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $31 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length31_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $31 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length31_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; 
X86-SSE41-LABEL: length31_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length31_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -1958,18 +1099,6 @@ define i1 @length31_eq(i8* %x, i8* %y) nounwind { } define i1 @length31_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length31_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $31 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length31_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -1985,18 +1114,6 @@ define i1 @length31_lt(i8* %x, i8* %y) nounwind { } define i1 @length31_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length31_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $31 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length31_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -2012,61 +1129,6 @@ define i1 @length31_gt(i8* %x, i8* %y) nounwind { } define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { -; X86-NOSSE-LABEL: length31_eq_prefer128: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $31 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; 
X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length31_eq_prefer128: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $31 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length31_eq_prefer128: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length31_eq_prefer128: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length31_eq_prefer128: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2123,55 +1185,6 @@ define i1 @length31_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"= } define i1 @length31_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length31_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $31 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: 
pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length31_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $31 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length31_eq_const: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length31_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length31_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2224,16 +1237,6 @@ define i1 @length31_eq_const(i8* %X) nounwind { } define i32 @length32(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length32: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length32: ; X64: # %bb.0: ; X64-NEXT: movl $32, %edx @@ -2245,61 +1248,6 @@ define i32 
@length32(i8* %X, i8* %Y) nounwind { ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325 define i1 @length32_eq(i8* %x, i8* %y) nounwind { -; X86-NOSSE-LABEL: length32_eq: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length32_eq: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $32 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length32_eq: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu 
(%rdi), %xmm0 @@ -2369,18 +1317,6 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind { } define i1 @length32_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length32_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length32_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -2396,18 +1332,6 @@ define i1 @length32_lt(i8* %x, i8* %y) nounwind { } define i1 @length32_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length32_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $32 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length32_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -2423,61 +1347,6 @@ define i1 @length32_gt(i8* %x, i8* %y) nounwind { } define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { -; X86-NOSSE-LABEL: length32_eq_prefer128: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: sete %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length32_eq_prefer128: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $32 -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: sete %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_prefer128: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu (%eax), %xmm2 -; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: sete %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length32_eq_prefer128: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu (%eax), %xmm2 -; X86-SSE41-NEXT: pxor %xmm0, %xmm2 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: sete %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_prefer128: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2534,55 +1403,6 @@ define i1 @length32_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"= } define i1 @length32_eq_const(i8* %X) nounwind { -; X86-NOSSE-LABEL: length32_eq_const: -; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl $0 -; X86-NOSSE-NEXT: pushl $32 -; X86-NOSSE-NEXT: pushl $.L.str -; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NOSSE-NEXT: calll memcmp -; X86-NOSSE-NEXT: addl $16, %esp -; X86-NOSSE-NEXT: testl %eax, %eax -; X86-NOSSE-NEXT: setne %al -; X86-NOSSE-NEXT: retl -; -; X86-SSE1-LABEL: length32_eq_const: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: pushl $0 -; X86-SSE1-NEXT: pushl $32 -; X86-SSE1-NEXT: pushl $.L.str -; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: calll memcmp -; X86-SSE1-NEXT: addl $16, %esp -; X86-SSE1-NEXT: testl %eax, %eax -; X86-SSE1-NEXT: setne %al -; X86-SSE1-NEXT: retl -; -; X86-SSE2-LABEL: length32_eq_const: -; X86-SSE2: # %bb.0: -; 
X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; X86-SSE2-NEXT: setne %al -; X86-SSE2-NEXT: retl -; -; X86-SSE41-LABEL: length32_eq_const: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: ptest %xmm0, %xmm0 -; X86-SSE41-NEXT: setne %al -; X86-SSE41-NEXT: retl -; ; X64-SSE2-LABEL: length32_eq_const: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 @@ -2648,16 +1468,6 @@ define i1 @length32_eq_const(i8* %X) nounwind { } define i32 @length48(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length48: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length48: ; X64: # %bb.0: ; X64-NEXT: movl $48, %edx @@ -2667,18 +1477,6 @@ define i32 @length48(i8* %X, i8* %Y) nounwind { } define i1 @length48_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length48_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length48_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -2746,18 +1544,6 @@ define i1 @length48_eq(i8* %x, i8* %y) nounwind { } define i1 @length48_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: 
length48_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length48_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -2773,18 +1559,6 @@ define i1 @length48_lt(i8* %x, i8* %y) nounwind { } define i1 @length48_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length48_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length48_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -2800,18 +1574,6 @@ define i1 @length48_gt(i8* %x, i8* %y) nounwind { } define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"="128" { -; X86-LABEL: length48_eq_prefer128: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length48_eq_prefer128: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -2827,18 +1589,6 @@ define i1 @length48_eq_prefer128(i8* %x, i8* %y) nounwind "prefer-vector-width"= } define i1 @length48_eq_const(i8* %X) nounwind { -; X86-LABEL: length48_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $48 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length48_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -2904,16 +1654,6 @@ define i1 @length48_eq_const(i8* %X) nounwind { } define i32 
@length63(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length63: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length63: ; X64: # %bb.0: ; X64-NEXT: movl $63, %edx @@ -2923,18 +1663,6 @@ define i32 @length63(i8* %X, i8* %Y) nounwind { } define i1 @length63_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length63_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length63_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -2999,18 +1727,6 @@ define i1 @length63_eq(i8* %x, i8* %y) nounwind { } define i1 @length63_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length63_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length63_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3026,18 +1742,6 @@ define i1 @length63_lt(i8* %x, i8* %y) nounwind { } define i1 @length63_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length63_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length63_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3053,18 +1757,6 @@ define i1 @length63_gt(i8* %x, i8* %y) nounwind { } define i1 @length63_eq_const(i8* %X) nounwind { -; X86-LABEL: length63_eq_const: -; X86: # %bb.0: -; X86-NEXT: 
pushl $0 -; X86-NEXT: pushl $63 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length63_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -3130,16 +1822,6 @@ define i1 @length63_eq_const(i8* %X) nounwind { } define i32 @length64(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length64: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length64: ; X64: # %bb.0: ; X64-NEXT: movl $64, %edx @@ -3149,18 +1831,6 @@ define i32 @length64(i8* %X, i8* %Y) nounwind { } define i1 @length64_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length64_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length64_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -3240,18 +1910,6 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind { } define i1 @length64_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length64_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length64_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3267,18 +1925,6 @@ define i1 @length64_lt(i8* %x, i8* %y) nounwind { } define i1 @length64_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length64_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length64_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3294,18 +1940,6 @@ define i1 @length64_gt(i8* %x, i8* %y) nounwind { } define i1 @length64_eq_const(i8* %X) nounwind { -; X86-LABEL: length64_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $64 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length64_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -3386,16 +2020,6 @@ define i1 @length64_eq_const(i8* %X) nounwind { } define i32 @length96(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length96: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length96: ; X64: # %bb.0: ; X64-NEXT: movl $96, %edx @@ -3405,18 +2029,6 @@ define i32 @length96(i8* %X, i8* %Y) nounwind { } define i1 @length96_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length96_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length96_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -3498,18 +2110,6 @@ define i1 @length96_eq(i8* %x, i8* %y) nounwind { } define i1 @length96_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length96_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: 
def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length96_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3525,18 +2125,6 @@ define i1 @length96_lt(i8* %x, i8* %y) nounwind { } define i1 @length96_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length96_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length96_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3552,18 +2140,6 @@ define i1 @length96_gt(i8* %x, i8* %y) nounwind { } define i1 @length96_eq_const(i8* %X) nounwind { -; X86-LABEL: length96_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $96 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length96_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -3646,16 +2222,6 @@ define i1 @length96_eq_const(i8* %X) nounwind { } define i32 @length127(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length127: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length127: ; X64: # %bb.0: ; X64-NEXT: movl $127, %edx @@ -3665,18 +2231,6 @@ define i32 @length127(i8* %X, i8* %Y) nounwind { } define i1 @length127_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length127_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length127_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq 
%rax @@ -3755,18 +2309,6 @@ define i1 @length127_eq(i8* %x, i8* %y) nounwind { } define i1 @length127_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length127_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length127_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3782,18 +2324,6 @@ define i1 @length127_lt(i8* %x, i8* %y) nounwind { } define i1 @length127_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length127_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length127_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -3809,18 +2339,6 @@ define i1 @length127_gt(i8* %x, i8* %y) nounwind { } define i1 @length127_eq_const(i8* %X) nounwind { -; X86-LABEL: length127_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $127 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length127_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -3903,16 +2421,6 @@ define i1 @length127_eq_const(i8* %X) nounwind { } define i32 @length128(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length128: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length128: ; X64: # %bb.0: ; X64-NEXT: movl $128, %edx @@ -3922,18 +2430,6 @@ define i32 @length128(i8* %X, i8* %Y) nounwind { } 
define i1 @length128_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length128_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length128_eq: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4012,18 +2508,6 @@ define i1 @length128_eq(i8* %x, i8* %y) nounwind { } define i1 @length128_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length128_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length128_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4039,18 +2523,6 @@ define i1 @length128_lt(i8* %x, i8* %y) nounwind { } define i1 @length128_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length128_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length128_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4066,18 +2538,6 @@ define i1 @length128_gt(i8* %x, i8* %y) nounwind { } define i1 @length128_eq_const(i8* %X) nounwind { -; X86-LABEL: length128_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $128 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-SSE-LABEL: length128_eq_const: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: pushq %rax @@ -4160,16 +2620,6 @@ define i1 @length128_eq_const(i8* %X) nounwind { } define i32 
@length192(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length192: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length192: ; X64: # %bb.0: ; X64-NEXT: movl $192, %edx @@ -4179,18 +2629,6 @@ define i32 @length192(i8* %X, i8* %Y) nounwind { } define i1 @length192_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length192_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length192_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4206,18 +2644,6 @@ define i1 @length192_eq(i8* %x, i8* %y) nounwind { } define i1 @length192_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length192_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length192_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4233,18 +2659,6 @@ define i1 @length192_lt(i8* %x, i8* %y) nounwind { } define i1 @length192_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length192_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length192_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4260,18 +2674,6 @@ define i1 @length192_gt(i8* %x, i8* %y) nounwind { } define i1 @length192_eq_const(i8* %X) nounwind { -; X86-LABEL: length192_eq_const: -; X86: # %bb.0: -; 
X86-NEXT: pushl $0 -; X86-NEXT: pushl $192 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length192_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4288,16 +2690,6 @@ define i1 @length192_eq_const(i8* %X) nounwind { } define i32 @length255(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length255: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length255: ; X64: # %bb.0: ; X64-NEXT: movl $255, %edx @@ -4307,18 +2699,6 @@ define i32 @length255(i8* %X, i8* %Y) nounwind { } define i1 @length255_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length255_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length255_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4334,18 +2714,6 @@ define i1 @length255_eq(i8* %x, i8* %y) nounwind { } define i1 @length255_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length255_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length255_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4361,18 +2729,6 @@ define i1 @length255_lt(i8* %x, i8* %y) nounwind { } define i1 @length255_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length255_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length255_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4388,18 +2744,6 @@ define i1 @length255_gt(i8* %x, i8* %y) nounwind { } define i1 @length255_eq_const(i8* %X) nounwind { -; X86-LABEL: length255_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $255 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length255_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4416,16 +2760,6 @@ define i1 @length255_eq_const(i8* %X) nounwind { } define i32 @length256(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length256: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length256: ; X64: # %bb.0: ; X64-NEXT: movl $256, %edx # imm = 0x100 @@ -4435,18 +2769,6 @@ define i32 @length256(i8* %X, i8* %Y) nounwind { } define i1 @length256_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length256_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length256_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4462,18 +2784,6 @@ define i1 @length256_eq(i8* %x, i8* %y) nounwind { } define i1 @length256_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length256_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, 
%esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length256_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4489,18 +2799,6 @@ define i1 @length256_lt(i8* %x, i8* %y) nounwind { } define i1 @length256_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length256_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length256_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4516,18 +2814,6 @@ define i1 @length256_gt(i8* %x, i8* %y) nounwind { } define i1 @length256_eq_const(i8* %X) nounwind { -; X86-LABEL: length256_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $256 # imm = 0x100 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length256_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4544,16 +2830,6 @@ define i1 @length256_eq_const(i8* %X) nounwind { } define i32 @length384(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length384: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length384: ; X64: # %bb.0: ; X64-NEXT: movl $384, %edx # imm = 0x180 @@ -4563,18 +2839,6 @@ define i32 @length384(i8* %X, i8* %Y) nounwind { } define i1 @length384_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length384_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl 
%eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length384_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4590,18 +2854,6 @@ define i1 @length384_eq(i8* %x, i8* %y) nounwind { } define i1 @length384_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length384_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length384_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4617,18 +2869,6 @@ define i1 @length384_lt(i8* %x, i8* %y) nounwind { } define i1 @length384_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length384_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length384_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4644,18 +2884,6 @@ define i1 @length384_gt(i8* %x, i8* %y) nounwind { } define i1 @length384_eq_const(i8* %X) nounwind { -; X86-LABEL: length384_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $384 # imm = 0x180 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length384_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4672,16 +2900,6 @@ define i1 @length384_eq_const(i8* %X) nounwind { } define i32 @length511(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length511: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; 
X86-NEXT: retl -; ; X64-LABEL: length511: ; X64: # %bb.0: ; X64-NEXT: movl $511, %edx # imm = 0x1FF @@ -4691,18 +2909,6 @@ define i32 @length511(i8* %X, i8* %Y) nounwind { } define i1 @length511_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length511_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length511_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4718,18 +2924,6 @@ define i1 @length511_eq(i8* %x, i8* %y) nounwind { } define i1 @length511_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length511_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length511_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4745,18 +2939,6 @@ define i1 @length511_lt(i8* %x, i8* %y) nounwind { } define i1 @length511_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length511_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setg %al -; X86-NEXT: retl -; ; X64-LABEL: length511_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4772,18 +2954,6 @@ define i1 @length511_gt(i8* %x, i8* %y) nounwind { } define i1 @length511_eq_const(i8* %X) nounwind { -; X86-LABEL: length511_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $511 # imm = 0x1FF -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, 
%eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length511_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4800,16 +2970,6 @@ define i1 @length511_eq_const(i8* %X) nounwind { } define i32 @length512(i8* %X, i8* %Y) nounwind { -; X86-LABEL: length512: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: length512: ; X64: # %bb.0: ; X64-NEXT: movl $512, %edx # imm = 0x200 @@ -4819,18 +2979,6 @@ define i32 @length512(i8* %X, i8* %Y) nounwind { } define i1 @length512_eq(i8* %x, i8* %y) nounwind { -; X86-LABEL: length512_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %al -; X86-NEXT: retl -; ; X64-LABEL: length512_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4846,18 +2994,6 @@ define i1 @length512_eq(i8* %x, i8* %y) nounwind { } define i1 @length512_lt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length512_lt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: shrl $31, %eax -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: retl -; ; X64-LABEL: length512_lt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4873,18 +3009,6 @@ define i1 @length512_lt(i8* %x, i8* %y) nounwind { } define i1 @length512_gt(i8* %x, i8* %y) nounwind { -; X86-LABEL: length512_gt: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: 
setg %al -; X86-NEXT: retl -; ; X64-LABEL: length512_gt: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4900,18 +3024,6 @@ define i1 @length512_gt(i8* %x, i8* %y) nounwind { } define i1 @length512_eq_const(i8* %X) nounwind { -; X86-LABEL: length512_eq_const: -; X86: # %bb.0: -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $512 # imm = 0x200 -; X86-NEXT: pushl $.L.str -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: length512_eq_const: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4929,16 +3041,6 @@ define i1 @length512_eq_const(i8* %X) nounwind { ; This checks that we do not do stupid things with huge sizes. define i32 @huge_length(i8* %X, i8* %Y) nounwind { -; X86-LABEL: huge_length: -; X86: # %bb.0: -; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: retl -; ; X64-LABEL: huge_length: ; X64: # %bb.0: ; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF @@ -4948,18 +3050,6 @@ define i32 @huge_length(i8* %X, i8* %Y) nounwind { } define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { -; X86-LABEL: huge_length_eq: -; X86: # %bb.0: -; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF -; X86-NEXT: pushl $-1 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: huge_length_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax @@ -4976,10 +3066,6 @@ define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind { ; This checks non-constant sizes. 
define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind { -; X86-LABEL: nonconst_length: -; X86: # %bb.0: -; X86-NEXT: jmp memcmp # TAILCALL -; ; X64-LABEL: nonconst_length: ; X64: # %bb.0: ; X64-NEXT: jmp memcmp # TAILCALL @@ -4988,18 +3074,6 @@ define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind { } define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind { -; X86-LABEL: nonconst_length_eq: -; X86: # %bb.0: -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll memcmp -; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sete %al -; X86-NEXT: retl -; ; X64-LABEL: nonconst_length_eq: ; X64: # %bb.0: ; X64-NEXT: pushq %rax diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll new file mode 100644 index 0000000000000..3c69841fbfc8e --- /dev/null +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll @@ -0,0 +1,615 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32 + +declare i32 @memcmp(i8* nocapture, i8* nocapture, i32) + +define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp2( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 +; X32-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; X32-NEXT: ret i32 [[TMP9]] 
+; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 2) + ret i32 %call +} + +define i32 @cmp2_align2(i8* nocapture readonly align 2 %x, i8* nocapture readonly align 2 %y) { +; X32-LABEL: @cmp2_align2( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 2 +; X32-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 +; X32-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; X32-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X32-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 +; X32-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; X32-NEXT: ret i32 [[TMP9]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 2) + ret i32 %call +} + +define i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp3( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 1 +; X32-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 1 +; X32-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 2 +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 2 +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]], align 1 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]], align 1 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to 
i32 +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 3) + ret i32 %call +} + +define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp4( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) +; X32-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) +; X32-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] +; X32-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] +; X32-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 +; X32-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 +; X32-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X32-NEXT: ret i32 [[TMP11]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 4) + ret i32 %call +} + +define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp5( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; 
X32-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]], align 1 +; X32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]], align 1 +; X32-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X32-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X32-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X32-NEXT: br label [[ENDBLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 5) + ret i32 %call +} + +define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp6( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] 
to i16* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i16* +; X32-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]], align 1 +; X32-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]], align 1 +; X32-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X32-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X32-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X32-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 +; X32-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] +; X32-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 6) + ret i32 %call +} + +define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp7( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 3 +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 3 +; X32-NEXT: [[TMP12:%.*]] = 
bitcast i8* [[TMP10]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32* +; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]], align 1 +; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 1 +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 7) + ret i32 %call +} + +define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp8( +; X32-NEXT: br label [[LOADBB:%.*]] +; X32: res_block: +; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X32-NEXT: br label [[ENDBLOCK:%.*]] +; X32: loadbb: +; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 +; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X32: loadbb1: +; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32* +; 
X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]], align 1 +; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 1 +; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X32: endblock: +; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X32-NEXT: ret i32 [[PHI_RES]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 8) + ret i32 %call +} + +define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp9( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 9) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 9) + ret i32 %call +} + +define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp10( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 10) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 10) + ret i32 %call +} + +define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp11( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 11) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 11) + ret i32 %call +} + +define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp12( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 12) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 12) + ret i32 %call +} + +define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp13( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 13) +; X32-NEXT: ret i32 [[CALL]] +; + 
%call = tail call i32 @memcmp(i8* %x, i8* %y, i32 13) + ret i32 %call +} + +define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp14( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 14) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 14) + ret i32 %call +} + +define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 15) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 15) + ret i32 %call +} + +define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp16( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 16) +; X32-NEXT: ret i32 [[CALL]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) + ret i32 %call +} + +define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq2( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 2) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq3( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i16, 
i16* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 2 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 2 +; X32-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]], align 1 +; X32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]], align 1 +; X32-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i16 +; X32-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i16 +; X32-NEXT: [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 3) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq4( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 4) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq5( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X32-NEXT: 
[[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]], align 1 +; X32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]], align 1 +; X32-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i32 +; X32-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i32 +; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 5) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq6( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16* +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16* +; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]], align 1 +; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]], align 1 +; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 +; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]] +; X32-NEXT: [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; X32-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 +; X32-NEXT: 
[[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 6) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq6_align4(i8* nocapture readonly align 4 %x, i8* nocapture readonly align 4 %y) { +; X32-LABEL: @cmp_eq6_align4( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16* +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16* +; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]], align 4 +; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]], align 4 +; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 +; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 +; X32-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]] +; X32-NEXT: [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]] +; X32-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 +; X32-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 6) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq7( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; 
X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3 +; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32* +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32* +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]], align 1 +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 1 +; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 7) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq8( +; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] +; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32* +; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32* +; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]], align 1 +; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 1 +; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] +; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] +; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; 
X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 8) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq9( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 9) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 9) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq10( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 10) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 10) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq11( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 11) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 11) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq12( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 12) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 12) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret 
i32 %conv +} + +define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq13( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 13) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 13) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq14( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 14) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 14) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq15( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 15) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 15) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { +; X32-LABEL: @cmp_eq16( +; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i32 16) +; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 +; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X32-NEXT: ret i32 [[CONV]] +; + %call = tail call i32 @memcmp(i8* %x, i8* %y, i32 16) + %cmp = icmp eq i32 %call, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll index 
df05021473585..27b3ce4f04b27 100644 --- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll +++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll @@ -1,238 +1,205 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -expandmemcmp -mtriple=i686-unknown-unknown -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X32 -; RUN: opt -S -expandmemcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_1LD -; RUN: opt -S -expandmemcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_2LD +; RUN: opt -S -expandmemcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD +; RUN: opt -S -expandmemcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128 < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD declare i32 @memcmp(i8* nocapture, i8* nocapture, i64) define i32 @cmp2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 -; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 -; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: ret i32 [[TMP9]] +; X64-LABEL: 
@cmp2( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 +; X64-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X64-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 +; X64-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; X64-NEXT: ret i32 [[TMP9]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) ret i32 %call } define i32 @cmp2_align2(i8* nocapture readonly align 2 %x, i8* nocapture readonly align 2 %y) { -; ALL-LABEL: @cmp2_align2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 2 -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 -; ALL-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 -; ALL-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 -; ALL-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: ret i32 [[TMP9]] +; X64-LABEL: @cmp2_align2( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 2 +; X64-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2 +; X64-NEXT: [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]]) +; X64-NEXT: [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]]) +; X64-NEXT: [[TMP7:%.*]] = zext i16 [[TMP5]] to i32 +; X64-NEXT: [[TMP8:%.*]] = zext i16 [[TMP6]] to i32 +; X64-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]] +; X64-NEXT: ret i32 [[TMP9]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) ret i32 %call } define 
i32 @cmp3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp3( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 1 -; ALL-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 1 -; ALL-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 2 -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 2 -; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]], align 1 -; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]], align 1 -; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 -; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; ALL-NEXT: br label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] +; X64-LABEL: @cmp3( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i16* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP4]], align 1 +; X64-NEXT: [[TMP7]] = call i16 @llvm.bswap.i16(i16 
[[TMP5]]) +; X64-NEXT: [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 2 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 2 +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]], align 1 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]], align 1 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 3) ret i32 %call } define i32 @cmp4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp4( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 -; ALL-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) -; ALL-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) -; ALL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] -; ALL-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] -; ALL-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 -; ALL-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 -; ALL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] -; ALL-NEXT: ret i32 [[TMP11]] +; X64-LABEL: @cmp4( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X64-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X64-NEXT: [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]]) 
+; X64-NEXT: [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]]) +; X64-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]] +; X64-NEXT: [[TMP9:%.*]] = zext i1 [[TMP7]] to i32 +; X64-NEXT: [[TMP10:%.*]] = zext i1 [[TMP8]] to i32 +; X64-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]] +; X64-NEXT: ret i32 [[TMP11]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) ret i32 %call } define i32 @cmp5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp5( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 -; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 -; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 -; ALL-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]], align 1 -; ALL-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]], align 1 -; ALL-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 -; ALL-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 -; ALL-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] -; ALL-NEXT: br label [[ENDBLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] +; X64-LABEL: @cmp5( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: 
[[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 +; X64-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP10]], align 1 +; X64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP11]], align 1 +; X64-NEXT: [[TMP14:%.*]] = zext i8 [[TMP12]] to i32 +; X64-NEXT: [[TMP15:%.*]] = zext i8 [[TMP13]] to i32 +; X64-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]] +; X64-NEXT: br label [[ENDBLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 5) ret i32 %call } define i32 @cmp6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp6( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* 
[[Y:%.*]] to i32* -; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 -; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 -; ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 -; ALL-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i16* -; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i16* -; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]], align 1 -; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]], align 1 -; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) -; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) -; ALL-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 -; ALL-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 -; ALL-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] -; ALL-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] +; X64-LABEL: @cmp6( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 
1 +; X64-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: loadbb1: +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4 +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i16* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i16* +; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]], align 1 +; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]], align 1 +; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]]) +; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]]) +; X64-NEXT: [[TMP18]] = zext i16 [[TMP16]] to i32 +; X64-NEXT: [[TMP19]] = zext i16 [[TMP17]] to i32 +; X64-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]] +; X64-NEXT: br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 6) ret i32 %call } define i32 @cmp7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp7( -; ALL-NEXT: br label [[LOADBB:%.*]] -; ALL: res_block: -; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] -; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; ALL-NEXT: br label [[ENDBLOCK:%.*]] -; ALL: loadbb: -; ALL-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] -; ALL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] -; 
ALL-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; ALL-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] -; ALL: loadbb1: -; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 3 -; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 3 -; ALL-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32* -; ALL-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]] -; ALL-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]] -; ALL-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; ALL-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) -; ALL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] -; ALL-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; ALL: endblock: -; ALL-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; ALL-NEXT: ret i32 [[PHI_RES]] +; X64-LABEL: @cmp7( +; X64-NEXT: br label [[LOADBB:%.*]] +; X64: res_block: +; X64-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] +; X64-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] +; X64-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] +; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 +; X64-NEXT: br label [[ENDBLOCK:%.*]] +; X64: loadbb: +; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 +; X64-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) +; X64-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) +; X64-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] +; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] +; X64: 
loadbb1: +; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 3 +; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 3 +; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32* +; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]], align 1 +; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 1 +; X64-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) +; X64-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) +; X64-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] +; X64-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] +; X64: endblock: +; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] +; X64-NEXT: ret i32 [[PHI_RES]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 7) ret i32 %call } define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp8( -; X32-NEXT: br label [[LOADBB:%.*]] -; X32: res_block: -; X32-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ] -; X32-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ] -; X32-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]] -; X32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1 -; X32-NEXT: br label [[ENDBLOCK:%.*]] -; X32: loadbb: -; X32-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 1 -; X32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]], align 1 -; X32-NEXT: [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]]) -; X32-NEXT: [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]]) -; X32-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]] -; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]] -; X32: loadbb1: -; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; X32-NEXT: [[TMP11:%.*]] = 
getelementptr i8, i8* [[Y]], i64 4 -; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* -; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32* -; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]], align 1 -; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 1 -; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]]) -; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]]) -; X32-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]] -; X32-NEXT: br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]] -; X32: endblock: -; X32-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ] -; X32-NEXT: ret i32 [[PHI_RES]] -; ; X64-LABEL: @cmp8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* ; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* @@ -252,10 +219,6 @@ define i32 @cmp8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp9( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -289,10 +252,6 @@ define i32 @cmp9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp10( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -332,10 +291,6 @@ define i32 @cmp10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp11( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp11( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -347,8 +302,8 @@ 
define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64: loadbb: ; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* ; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]], align 1 ; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) ; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] @@ -358,8 +313,8 @@ define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 3 ; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64* ; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64* -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]], align 1 +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]], align 1 ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) ; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] @@ -373,10 +328,6 @@ define i32 @cmp11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp12( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -416,10 +367,6 @@ define i32 @cmp12(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp13( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; X32-NEXT: ret i32 
[[CALL]] -; ; X64-LABEL: @cmp13( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -431,8 +378,8 @@ define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64: loadbb: ; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* ; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]], align 1 ; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) ; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] @@ -442,8 +389,8 @@ define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 5 ; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64* ; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64* -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]], align 1 +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]], align 1 ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) ; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] @@ -457,10 +404,6 @@ define i32 @cmp13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp14( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp14( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -472,8 +415,8 @@ define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64: loadbb: ; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* ; X64-NEXT: [[TMP4:%.*]] = 
bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]], align 1 ; X64-NEXT: [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]]) ; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] @@ -483,8 +426,8 @@ define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 6 ; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64* ; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64* -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]], align 1 +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]], align 1 ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) ; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] @@ -498,10 +441,6 @@ define i32 @cmp14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp15( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp15( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -513,8 +452,8 @@ define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64: loadbb: ; X64-NEXT: [[TMP3:%.*]] = bitcast i8* [[X:%.*]] to i64* ; X64-NEXT: [[TMP4:%.*]] = bitcast i8* [[Y:%.*]] to i64* -; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]] -; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]] +; X64-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP3]], align 1 +; X64-NEXT: [[TMP6:%.*]] = load i64, i64* [[TMP4]], align 1 ; X64-NEXT: [[TMP7]] = call 
i64 @llvm.bswap.i64(i64 [[TMP5]]) ; X64-NEXT: [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]]) ; X64-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]] @@ -524,8 +463,8 @@ define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { ; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 7 ; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64* ; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64* -; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]] -; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]] +; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]], align 1 +; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]], align 1 ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]]) ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]]) ; X64-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]] @@ -539,10 +478,6 @@ define i32 @cmp15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: ret i32 [[CALL]] -; ; X64-LABEL: @cmp16( ; X64-NEXT: br label [[LOADBB:%.*]] ; X64: res_block: @@ -580,16 +515,16 @@ define i32 @cmp16(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq2( -; ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; ALL-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 -; ALL-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X64-LABEL: @cmp_eq2( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* 
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* +; X64-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 +; X64-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 +; X64-NEXT: [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 2) %cmp = icmp eq i32 %call, 0 @@ -598,26 +533,6 @@ define i32 @cmp_eq2(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq3( -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i16* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i16* -; X32-NEXT: [[TMP3:%.*]] = load i16, i16* [[TMP1]], align 1 -; X32-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 1 -; X32-NEXT: [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 2 -; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 2 -; X32-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]], align 1 -; X32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]], align 1 -; X32-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i16 -; X32-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i16 -; X32-NEXT: [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]] -; X32-NEXT: [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]] -; X32-NEXT: [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0 -; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq3( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -669,16 +584,16 @@ define i32 @cmp_eq3(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; ALL-LABEL: @cmp_eq4( -; 
ALL-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; ALL-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; ALL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 -; ALL-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 -; ALL-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] -; ALL-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 -; ALL-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 -; ALL-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; ALL-NEXT: ret i32 [[CONV]] +; X64-LABEL: @cmp_eq4( +; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* +; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* +; X64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 +; X64-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 +; X64-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]] +; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32 +; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0 +; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; X64-NEXT: ret i32 [[CONV]] ; %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4) %cmp = icmp eq i32 %call, 0 @@ -687,26 +602,6 @@ define i32 @cmp_eq4(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq5( -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 -; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 -; X32-NEXT: [[TMP8:%.*]] = load i8, i8* [[TMP6]], align 1 -; X32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP7]], align 1 -; X32-NEXT: [[TMP10:%.*]] = zext i8 [[TMP8]] to i32 -; X32-NEXT: [[TMP11:%.*]] = zext i8 [[TMP9]] to i32 -; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] -; X32-NEXT: [[TMP13:%.*]] = or 
i32 [[TMP5]], [[TMP12]] -; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq5( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -758,28 +653,6 @@ define i32 @cmp_eq5(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq6( -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 -; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 -; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16* -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16* -; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]], align 1 -; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]], align 1 -; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 -; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 -; X32-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]] -; X32-NEXT: [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]] -; X32-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 -; X32-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq6( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -835,28 +708,6 @@ define i32 @cmp_eq6(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq6_align4(i8* nocapture readonly align 4 %x, i8* nocapture readonly align 4 %y) { -; X32-LABEL: @cmp_eq6_align4( 
-; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4 -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4 -; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 -; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16* -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16* -; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]], align 4 -; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]], align 4 -; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32 -; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32 -; X32-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]] -; X32-NEXT: [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]] -; X32-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 -; X32-NEXT: [[TMP17:%.*]] = zext i1 [[TMP16]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq6_align4( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -912,26 +763,6 @@ define i32 @cmp_eq6_align4(i8* nocapture readonly align 4 %x, i8* nocapture read } define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq7( -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 -; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3 -; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3 -; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32* -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32* -; X32-NEXT: [[TMP10:%.*]] = load i32, 
i32* [[TMP8]], align 1 -; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 1 -; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] -; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] -; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq7( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -985,26 +816,6 @@ define i32 @cmp_eq7(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq8( -; X32-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i32* -; X32-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i32* -; X32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 1 -; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 1 -; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] -; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4 -; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4 -; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32* -; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32* -; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]], align 1 -; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 1 -; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]] -; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]] -; X32-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 -; X32-NEXT: [[TMP15:%.*]] = zext i1 [[TMP14]] to i32 -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64-LABEL: @cmp_eq8( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i64* ; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i64* @@ -1023,12 +834,6 @@ define i32 @cmp_eq8(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 
@cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq9( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 9) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq9( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1080,12 +885,6 @@ define i32 @cmp_eq9(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq10( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 10) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq10( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1141,12 +940,6 @@ define i32 @cmp_eq10(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq11( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 11) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq11( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1200,12 +993,6 @@ define i32 @cmp_eq11(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq12(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq12( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 12) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq12( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1261,12 +1048,6 @@ define i32 @cmp_eq12(i8* 
nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq13( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 13) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq13( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1320,12 +1101,6 @@ define i32 @cmp_eq13(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq14( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 14) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq14( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1379,12 +1154,6 @@ define i32 @cmp_eq14(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq15( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 15) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64_1LD-LABEL: @cmp_eq15( ; X64_1LD-NEXT: br label [[LOADBB:%.*]] ; X64_1LD: res_block: @@ -1438,12 +1207,6 @@ define i32 @cmp_eq15(i8* nocapture readonly %x, i8* nocapture readonly %y) { } define i32 @cmp_eq16(i8* nocapture readonly %x, i8* nocapture readonly %y) { -; X32-LABEL: @cmp_eq16( -; X32-NEXT: [[CALL:%.*]] = tail call i32 @memcmp(i8* [[X:%.*]], i8* [[Y:%.*]], i64 16) -; X32-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 -; X32-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 -; X32-NEXT: ret i32 [[CONV]] -; ; X64-LABEL: @cmp_eq16( ; X64-NEXT: [[TMP1:%.*]] = bitcast i8* 
[[X:%.*]] to i128* ; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128* From e5b15c01817da62f4b224f7554d5c84daae80d5f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 15 Aug 2021 11:32:28 -0700 Subject: [PATCH 076/700] [X86] Add some tests to show incorrect commuting of vcmpsh instructions. --- .../CodeGen/X86/select-of-half-constants.ll | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 llvm/test/CodeGen/X86/select-of-half-constants.ll diff --git a/llvm/test/CodeGen/X86/select-of-half-constants.ll b/llvm/test/CodeGen/X86/select-of-half-constants.ll new file mode 100644 index 0000000000000..e2a2190a1b72b --- /dev/null +++ b/llvm/test/CodeGen/X86/select-of-half-constants.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64-AVX512FP16 + +; This should do a single load into the fp stack for the return, not diddle with xmm registers. + +define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { +; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_olt: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-AVX512FP16-NEXT: vcmpltsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} +; X64-AVX512FP16-NEXT: retq + %c = fcmp olt half %x, -4.0 + %r = select i1 %c, half 42.0, half 23.0 + ret half %r +} + +; FIXME: This should be vcmpgtsh not vcmpltsh. 
+define half @fcmp_select_fp_constants_ogt(half %x) nounwind readnone { +; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_ogt: +; X64-AVX512FP16: # %bb.0: +; X64-AVX512FP16-NEXT: vcmpltsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} +; X64-AVX512FP16-NEXT: retq + %c = fcmp ogt half %x, -4.0 + %r = select i1 %c, half 42.0, half 23.0 + ret half %r +} + From 786b8fcc9b1eed093926e6d9c36891ab72b7bde3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 15 Aug 2021 11:35:20 -0700 Subject: [PATCH 077/700] [X86] Add vcmpsh/vcmpph to X86InstrInfo::commuteInstructionImpl. They were already added to findCommuteOpIndices, but they also need to be in X86InstrInfo::commuteInstructionImpl in order to adjust the immediate control. --- llvm/lib/Target/X86/X86InstrInfo.cpp | 4 ++++ llvm/test/CodeGen/X86/select-of-half-constants.ll | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 38a18fddac0ba..ab5a34181cc3a 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2241,6 +2241,10 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VCMPSSZrr: case X86::VCMPPDZrri: case X86::VCMPPSZrri: + case X86::VCMPSHZrr: + case X86::VCMPPHZrri: + case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rri: case X86::VCMPPDZ128rri: case X86::VCMPPSZ128rri: case X86::VCMPPDZ256rri: diff --git a/llvm/test/CodeGen/X86/select-of-half-constants.ll b/llvm/test/CodeGen/X86/select-of-half-constants.ll index e2a2190a1b72b..e74e083f8cfe5 100644 --- a/llvm/test/CodeGen/X86/select-of-half-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-half-constants.ll @@ -16,11 +16,10 @@ define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { 
ret half %r } -; FIXME: This should be vcmpgtsh not vcmpltsh. define half @fcmp_select_fp_constants_ogt(half %x) nounwind readnone { ; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_ogt: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vcmpltsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 +; X64-AVX512FP16-NEXT: vcmpgtsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 ; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} From 62a4c2c10e8eadd5bab49be72d642db257b46100 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sat, 14 Aug 2021 12:46:08 -0700 Subject: [PATCH 078/700] DWARFVerifier: Check section-relative references at the end of the section This ensures that debug_types references aren't looked for in debug_info section. Behavior is still going to be questionable in an unlinked object file - since cross-cu references could refer to symbols in another .debug_info (or, in theory, .debug_types) chunk - but if a producer only uses ref_addr to refer to things within the same .debug_info chunk in an object file (eg: whole program optimization/LTO - producing two CUs into a single .debug_info section in an object file - the ref_addrs there could be resolved relative to that .debug_info chunk, not needing to consider comdat (DWARFv5 type units or other creatures) chunks of .debug_info, etc) --- .../llvm/DebugInfo/DWARF/DWARFVerifier.h | 17 +- llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 52 +++-- .../test/DebugInfo/X86/skeleton-unit-verify.s | 1 - .../X86/verify_invalid_ref_addr_between.yaml | 2 +- .../X86/verify_invalid_ref_multi_section.s | 193 ++++++++++++++++++ .../X86/verify_overlapping_cu_ranges.yaml | 2 +- 6 files changed, 243 insertions(+), 24 deletions(-) create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h 
b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 161a4f8f8f06a..5ab216598bb43 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -79,14 +79,11 @@ class DWARFVerifier { raw_ostream &OS; DWARFContext &DCtx; DIDumpOptions DumpOpts; - /// A map that tracks all references (converted absolute references) so we - /// can verify each reference points to a valid DIE and not an offset that - /// lies between to valid DIEs. - std::map> ReferenceToDIEOffsets; uint32_t NumDebugLineErrors = 0; // Used to relax some checks that do not currently work portably bool IsObjectFile; bool IsMachOObject; + using ReferenceMap = std::map>; raw_ostream &error() const; raw_ostream &warn() const; @@ -144,7 +141,9 @@ class DWARFVerifier { /// \param Unit The DWARF Unit to verify. /// /// \returns The number of errors that occurred during verification. - unsigned verifyUnitContents(DWARFUnit &Unit); + unsigned verifyUnitContents(DWARFUnit &Unit, + ReferenceMap &UnitLocalReferences, + ReferenceMap &CrossUnitReferences); /// Verifies the unit headers and contents in a .debug_info or .debug_types /// section. @@ -196,7 +195,9 @@ class DWARFVerifier { /// /// \returns NumErrors The number of errors occurred during verification of /// attributes' forms in a unit - unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue); + unsigned verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue, + ReferenceMap &UnitLocalReferences, + ReferenceMap &CrossUnitReferences); /// Verifies the all valid references that were found when iterating through /// all of the DIE attributes. 
@@ -208,7 +209,9 @@ class DWARFVerifier { /// /// \returns NumErrors The number of errors occurred during verification of /// references for the .debug_info and .debug_types sections - unsigned verifyDebugInfoReferences(); + unsigned verifyDebugInfoReferences( + const ReferenceMap &, + llvm::function_ref GetUnitForDieOffset); /// Verify the DW_AT_stmt_list encoding and value and ensure that no /// compile units that have the same DW_AT_stmt_list value. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index ac624ec8b80fb..c6e414a8eefe4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -158,7 +158,9 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData, return Success; } -unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit) { +unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit, + ReferenceMap &UnitLocalReferences, + ReferenceMap &CrossUnitReferences) { unsigned NumUnitErrors = 0; unsigned NumDies = Unit.getNumDIEs(); for (unsigned I = 0; I < NumDies; ++I) { @@ -169,7 +171,8 @@ unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit) { for (auto AttrValue : Die.attributes()) { NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue); - NumUnitErrors += verifyDebugInfoForm(Die, AttrValue); + NumUnitErrors += verifyDebugInfoForm(Die, AttrValue, UnitLocalReferences, + CrossUnitReferences); } if (Die.hasChildren()) { @@ -299,6 +302,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, bool hasDIE = DebugInfoData.isValidOffset(Offset); DWARFUnitVector TypeUnitVector; DWARFUnitVector CompileUnitVector; + /// A map that tracks all references (converted absolute references) so we + /// can verify each reference points to a valid DIE and not an offset that + /// lies between to valid DIEs. 
+ ReferenceMap CrossUnitReferences; while (hasDIE) { OffsetStart = Offset; if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType, @@ -309,6 +316,7 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, } else { DWARFUnitHeader Header; Header.extract(DCtx, DebugInfoData, &OffsetStart, SectionKind); + ReferenceMap UnitLocalReferences; DWARFUnit *Unit; switch (UnitType) { case dwarf::DW_UT_type: @@ -337,7 +345,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, } default: { llvm_unreachable("Invalid UnitType."); } } - NumDebugInfoErrors += verifyUnitContents(*Unit); + NumDebugInfoErrors += + verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences); + NumDebugInfoErrors += verifyDebugInfoReferences( + UnitLocalReferences, [&](uint64_t Offset) { return Unit; }); } hasDIE = DebugInfoData.isValidOffset(Offset); ++UnitIdx; @@ -348,7 +359,14 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S, } if (!isHeaderChainValid) ++NumDebugInfoErrors; - NumDebugInfoErrors += verifyDebugInfoReferences(); + NumDebugInfoErrors += verifyDebugInfoReferences( + CrossUnitReferences, [&](uint64_t Offset) -> DWARFUnit * { + if (DWARFUnit *U = TypeUnitVector.getUnitForOffset(Offset)) + return U; + if (DWARFUnit *U = CompileUnitVector.getUnitForOffset(Offset)) + return U; + return nullptr; + }); return NumDebugInfoErrors; } @@ -587,7 +605,9 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, } unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, - DWARFAttribute &AttrValue) { + DWARFAttribute &AttrValue, + ReferenceMap &LocalReferences, + ReferenceMap &CrossUnitReferences) { const DWARFObject &DObj = DCtx.getDWARFObj(); auto DieCU = Die.getDwarfUnit(); unsigned NumErrors = 0; @@ -615,7 +635,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, } else { // Valid reference, but we will verify it points to an actual // DIE later. 
- ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset()); + LocalReferences[*RefVal].insert(Die.getOffset()); } } break; @@ -634,7 +654,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, } else { // Valid reference, but we will verify it points to an actual // DIE later. - ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset()); + CrossUnitReferences[*RefVal].insert(Die.getOffset()); } } break; @@ -694,20 +714,24 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, return NumErrors; } -unsigned DWARFVerifier::verifyDebugInfoReferences() { - // Take all references and make sure they point to an actual DIE by - // getting the DIE by offset and emitting an error - OS << "Verifying .debug_info references...\n"; +unsigned DWARFVerifier::verifyDebugInfoReferences( + const ReferenceMap &References, + llvm::function_ref GetUnitForOffset) { + auto GetDIEForOffset = [&](uint64_t Offset) { + if (DWARFUnit *U = GetUnitForOffset(Offset)) + return U->getDIEForOffset(Offset); + return DWARFDie(); + }; unsigned NumErrors = 0; for (const std::pair> &Pair : - ReferenceToDIEOffsets) { - if (DCtx.getDIEForOffset(Pair.first)) + References) { + if (GetDIEForOffset(Pair.first)) continue; ++NumErrors; error() << "invalid DIE reference " << format("0x%08" PRIx64, Pair.first) << ". Offset is in between DIEs:\n"; for (auto Offset : Pair.second) - dump(DCtx.getDIEForOffset(Offset)) << '\n'; + dump(GetDIEForOffset(Offset)) << '\n'; OS << "\n"; } return NumErrors; diff --git a/llvm/test/DebugInfo/X86/skeleton-unit-verify.s b/llvm/test/DebugInfo/X86/skeleton-unit-verify.s index 95fbd113942a4..a5911e93f4bc4 100644 --- a/llvm/test/DebugInfo/X86/skeleton-unit-verify.s +++ b/llvm/test/DebugInfo/X86/skeleton-unit-verify.s @@ -6,7 +6,6 @@ # CHECK-NEXT: warning: DW_TAG_skeleton_unit has DW_CHILDREN_yes but DIE has no children # CHECK-NEXT: DW_TAG_skeleton_unit # CHECK-NEXT: error: Skeleton compilation unit has children. -# CHECK-NEXT: Verifying .debug_info references... 
# CHECK-NEXT: Verifying .debug_types Unit Header Chain... # CHECK-NEXT: Errors detected. diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml index 605af01311af8..a412f7b879b8e 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_addr_between.yaml @@ -1,7 +1,7 @@ # RUN: yaml2obj %s -o %t.o # RUN: not llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s -# CHECK: Verifying .debug_info references... +# CHECK: Verifying .debug_info # CHECK-NEXT: error: invalid DIE reference 0x00000011. Offset is in between DIEs: --- !ELF diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s new file mode 100644 index 0000000000000..b1f94cfa65575 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s @@ -0,0 +1,193 @@ +# RUN: llvm-mc %s -o %t.o -filetype=obj +# RUN: llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s + +# CHECK-NOT: error: + +# Assembly generated from this source: +# struct t1 { int i; }; +# t1 v1; +# and compiled with -g -fdebug-types-section +# +# This demonstrates that llvm-dwarfdump --verify does not try to apply offsets found in the .debug_type + + .text + .file "test.cpp" + .file 1 "/usr/local/google/home/blaikie/dev/scratch" "test.cpp" + .section .debug_types,"G",@progbits,14297044602779165170,comdat + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .quad -4149699470930386446 # Type Signature + .long 30 # Type DIE Offset + .byte 1 # Abbrev [1] 0x17:0x25 DW_TAG_type_unit + .short 33 # DW_AT_language + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # Abbrev [2] 0x1e:0x16 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .long .Linfo_string6 # DW_AT_name + .byte 4 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x27:0xc DW_TAG_member + .long .Linfo_string4 # DW_AT_name + .long 52 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x34:0x7 DW_TAG_base_type + .long .Linfo_string5 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + .type v1,@object # @v1 + .bss + .globl v1 + .p2align 2 +v1: + .zero 4 + .size v1, 4 + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + 
.byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 14 # DW_FORM_strp + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 105 # DW_AT_signature + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 5 # Abbrev [5] 0xb:0x32 DW_TAG_compile_unit + .long .Linfo_string0 # DW_AT_producer + .short 33 # DW_AT_language + .long .Linfo_string1 # DW_AT_name + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Linfo_string2 # DW_AT_comp_dir + .byte 6 # Abbrev [6] 0x1e:0x15 DW_TAG_variable + .long .Linfo_string3 # DW_AT_name + .long 51 # DW_AT_type + # DW_AT_external + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 9 # DW_AT_location + .byte 3 + .quad v1 + .byte 7 # Abbrev [7] 0x33:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad -4149699470930386446 # DW_AT_signature + .byte 0 # End Of Children Mark +.Ldebug_info_end1: + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 14.0.0 (git@github.com:llvm/llvm-project.git 7f00c7ce4b186ab8ba2ae66c82efdcf908c61019)" # string offset=0 +.Linfo_string1: + .asciz "test.cpp" # string offset=101 +.Linfo_string2: + .asciz "/usr/local/google/home/blaikie/dev/scratch" # string offset=110 +.Linfo_string3: + .asciz "v1" # string offset=153 +.Linfo_string4: + .asciz "i" # string offset=156 +.Linfo_string5: + .asciz "int" # string offset=158 +.Linfo_string6: + .asciz "t1" # string offset=162 + .ident "clang version 14.0.0 (git@github.com:llvm/llvm-project.git 7f00c7ce4b186ab8ba2ae66c82efdcf908c61019)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_cu_ranges.yaml b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_cu_ranges.yaml index b0970cdac8b24..9533c2795783d 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_cu_ranges.yaml +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_overlapping_cu_ranges.yaml @@ -64,7 +64,7 @@ # CHECK-NEXT: DW_AT_low_pc (0x0000000000000000) # CHECK-NEXT: DW_AT_high_pc (0x0000000000000020) -# CHECK: Verifying .debug_info references... 
+# CHECK: Verifying --- !mach-o FileHeader: From 819818f7d56110d81b984ac443eca8e9fb4ee176 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 15 Aug 2021 11:42:33 -0700 Subject: [PATCH 079/700] [X86] Modify the commuted load isel pattern for VCMPSHZrm to match VCMPSSZrm/VCMPSDZrm. This allows commuting any immediate value. The previous code only commuted equality immediates. This was inherited from an earlier version of VCMPSSZrm/VCMPSDZrm. --- llvm/lib/Target/X86/X86InstrAVX512.td | 14 ++++++-------- llvm/test/CodeGen/X86/select-of-half-constants.ll | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9672e3835f469..34621b4e68dbf 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2652,23 +2652,21 @@ defm VCMPPD : avx512_vcmp, AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; defm VCMPPS : avx512_vcmp, AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VCMPPH : avx512_vcmp, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA; // Patterns to select fp compares with load as first operand. 
let Predicates = [HasAVX512] in { - def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, - timm:$cc)), + def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, timm:$cc)), (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, - timm:$cc)), + def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, timm:$cc)), (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } -defm VCMPPH : avx512_vcmp, - AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA; let Predicates = [HasFP16] in { - def : Pat<(v1i1 (X86cmpms(loadf16 addr:$src2), FR16X:$src1, CommutableCMPCC:$cc)), - (VCMPSHZrm FR16X:$src1, addr:$src2, imm:$cc)>; + def : Pat<(v1i1 (X86cmpms (loadf16 addr:$src2), FR16X:$src1, timm:$cc)), + (VCMPSHZrm FR16X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } // ---------------------------------------------------------------- diff --git a/llvm/test/CodeGen/X86/select-of-half-constants.ll b/llvm/test/CodeGen/X86/select-of-half-constants.ll index e74e083f8cfe5..e22d4c8b792dc 100644 --- a/llvm/test/CodeGen/X86/select-of-half-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-half-constants.ll @@ -19,8 +19,8 @@ define half @fcmp_select_fp_constants_olt(half %x) nounwind readnone { define half @fcmp_select_fp_constants_ogt(half %x) nounwind readnone { ; X64-AVX512FP16-LABEL: fcmp_select_fp_constants_ogt: ; X64-AVX512FP16: # %bb.0: -; X64-AVX512FP16-NEXT: vcmpgtsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 ; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-AVX512FP16-NEXT: vcmpgtsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 ; X64-AVX512FP16-NEXT: vmovsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-AVX512FP16-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} ; X64-AVX512FP16-NEXT: retq From ff95d2524ac7197cbac3a801680d60b863fc1399 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 15 Aug 2021 12:00:54 -0700 Subject: [PATCH 080/700] [X86] Prevent 
accidentally accepting cmpeqsh as a valid mnemonic. We should only accept as vcmpeqsh. Same for all the other 31 comparison values. --- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 3 ++- llvm/test/MC/X86/avx512-err.s | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 6cd64ad2592a8..b9d8c148f5fbf 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3184,7 +3184,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (CC != ~0U && (IsVCMP || CC < 8)) { + if (CC != ~0U && (IsVCMP || CC < 8) && + (IsVCMP || PatchedName.back() != 'h')) { if (PatchedName.endswith("ss")) PatchedName = IsVCMP ? "vcmpss" : "cmpss"; else if (PatchedName.endswith("sd")) diff --git a/llvm/test/MC/X86/avx512-err.s b/llvm/test/MC/X86/avx512-err.s index 0d353a6c54981..96e8c267979f3 100644 --- a/llvm/test/MC/X86/avx512-err.s +++ b/llvm/test/MC/X86/avx512-err.s @@ -20,3 +20,6 @@ vpmuld %xmm1, %xmm2, %xmm3 // ERR: invalid instruction mnemonic 'maskmov' maskmov %mm1, %mm2 + +// ERR: invalid instruction mnemonic 'cmpeqsh' +cmpeqsh %xmm2, %xmm1, %k0 From 44d0a99a12ec7ead4d2f5ef649ba05b40f6d463d Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Sun, 15 Aug 2021 12:30:52 -0700 Subject: [PATCH 081/700] Add missing triple for test --- .../tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s index b1f94cfa65575..71a93fad6c190 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/verify_invalid_ref_multi_section.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc %s -o 
%t.o -filetype=obj +# RUN: llvm-mc -triple x86_64-pc-linux %s -o %t.o -filetype=obj # RUN: llvm-dwarfdump -debug-info -verify %t.o | FileCheck %s # CHECK-NOT: error: From ca637014f148288ce59c16ee79ecce60c83af703 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 15 Aug 2021 14:14:33 -0400 Subject: [PATCH 082/700] [Analysis][SimplifyLibCalls] improve function signature check for memcmp This would assert/crash as shown in: https://llvm.org/PR50850 The matching for bcmp/bcopy should probably also be updated, but that's another patch. --- llvm/lib/Analysis/TargetLibraryInfo.cpp | 6 ++--- .../X86/simplify-libcalls-memcmp.ll | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/X86/simplify-libcalls-memcmp.ll diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index cfd9aff505620..1e377df2a3f7d 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -922,9 +922,9 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_vec_malloc: return (NumParams == 1 && FTy.getReturnType()->isPointerTy()); case LibFunc_memcmp: - return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) && - FTy.getParamType(0)->isPointerTy() && - FTy.getParamType(1)->isPointerTy()); + return NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) && + FTy.getParamType(0)->isPointerTy() && + FTy.getParamType(1)->isPointerTy() && IsSizeTTy(FTy.getParamType(2)); case LibFunc_memchr: case LibFunc_memrchr: diff --git a/llvm/test/Transforms/InstCombine/X86/simplify-libcalls-memcmp.ll b/llvm/test/Transforms/InstCombine/X86/simplify-libcalls-memcmp.ll new file mode 100644 index 0000000000000..0bb82d8546521 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/X86/simplify-libcalls-memcmp.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s 
-instcombine | FileCheck %s + +; This test requires a target with 'bcmp' in its library. +; It used to crash/assert because the function signature does +; not match the expected libcall signature for memcmp - +; the last arg should have type size_t -> i64 for this target. + +target triple = "x86_64-unknown-linux-gnu" + +@str = private unnamed_addr constant [6 x i8] c"abcde\00", align 1 + +declare i32 @memcmp(i8*, i8*, i32) + +define void @PR50850() { +; CHECK-LABEL: @PR50850( +; CHECK-NEXT: [[CALL:%.*]] = call i32 @memcmp(i8* bitcast (void ()* @PR50850 to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @str, i64 0, i64 0), i32 6) +; CHECK-NEXT: ret void +; + %call = call i32 @memcmp(i8* bitcast (void ()* @PR50850 to i8*), i8* bitcast ([6 x i8]* @str to i8*), i32 6) + ret void +} From 676f16bed5ebf1ef2ff1dbfe913f7f8a2e7fe2dd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 15 Aug 2021 21:09:35 -0700 Subject: [PATCH 083/700] [docs] Change llvm-xray options to use the preferred double-dash forms --- llvm/docs/XRayExample.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/docs/XRayExample.rst b/llvm/docs/XRayExample.rst index c0a2a7a917cfe..1ff66dcdff16b 100644 --- a/llvm/docs/XRayExample.rst +++ b/llvm/docs/XRayExample.rst @@ -78,7 +78,7 @@ been instrumented. 
We can see an example accounting with ``llvm-xray account``: :: - $ llvm-xray account xray-log.llc.m35qPB -top=10 -sort=sum -sortorder=dsc -instr_map ./bin/llc + $ llvm-xray account xray-log.llc.m35qPB --top=10 --sort=sum --sortorder=dsc --instr_map=./bin/llc Functions with latencies: 29 funcid count [ min, med, 90p, 99p, max] sum function 187 360 [ 0.000000, 0.000001, 0.000014, 0.000032, 0.000075] 0.001596 LLLexer.cpp:446:0: llvm::LLLexer::LexIdentifier() @@ -103,7 +103,7 @@ output for an example trace would look like the following: :: - $ llvm-xray convert -f yaml -symbolize -instr_map=./bin/llc xray-log.llc.m35qPB + $ llvm-xray convert -f yaml --symbolize --instr_map=./bin/llc xray-log.llc.m35qPB --- header: version: 1 @@ -151,7 +151,7 @@ function bodies to 1. We can do that with the $ XRAY_OPTIONS="patch_premain=true" ./bin/llc input.ll ==69819==XRay: Log file in 'xray-log.llc.5rqxkU' - $ llvm-xray account xray-log.llc.5rqxkU -top=10 -sort=sum -sortorder=dsc -instr_map ./bin/llc + $ llvm-xray account xray-log.llc.5rqxkU --top=10 --sort=sum --sortorder=dsc --instr_map=./bin/llc Functions with latencies: 36652 funcid count [ min, med, 90p, 99p, max] sum function 75 1 [ 0.672368, 0.672368, 0.672368, 0.672368, 0.672368] 0.672368 llc.cpp:271:0: main @@ -205,7 +205,7 @@ The way to use the command is to output the top stacks by call count and time sp :: - $ llvm-xray stack xray-log.llc.5rqxkU -instr_map ./bin/llc + $ llvm-xray stack xray-log.llc.5rqxkU --instr_map=./bin/llc Unique Stacks: 3069 Top 10 Stacks by leaf sum: @@ -227,9 +227,9 @@ In the default mode, identical stacks on different threads are independently aggregated. In a multithreaded program, you may end up having identical call stacks fill your list of top calls. -To address this, you may specify the ``-aggregate-threads`` or -``-per-thread-stacks`` flags. 
``-per-thread-stacks`` treats the thread id as an -implicit root in each call stack tree, while ``-aggregate-threads`` combines +To address this, you may specify the ``--aggregate-threads`` or +``--per-thread-stacks`` flags. ``--per-thread-stacks`` treats the thread id as an +implicit root in each call stack tree, while ``--aggregate-threads`` combines identical stacks from all threads. Flame Graph Generation @@ -243,16 +243,16 @@ FlameGraph tool, currently available on `github To generate output for a flamegraph, a few more options are necessary. -- ``-all-stacks`` - Emits all of the stacks. -- ``-stack-format`` - Choose the flamegraph output format 'flame'. -- ``-aggregation-type`` - Choose the metric to graph. +- ``--all-stacks`` - Emits all of the stacks. +- ``--stack-format`` - Choose the flamegraph output format 'flame'. +- ``--aggregation-type`` - Choose the metric to graph. You may pipe the command output directly to the flamegraph tool to obtain an svg file. :: - $llvm-xray stack xray-log.llc.5rqxkU -instr_map ./bin/llc -stack-format=flame -aggregation-type=time -all-stacks | \ + $ llvm-xray stack xray-log.llc.5rqxkU --instr_map=./bin/llc --stack-format=flame --aggregation-type=time --all-stacks | \ /path/to/FlameGraph/flamegraph.pl > flamegraph.svg If you open the svg in a browser, mouse events allow exploring the call stacks. 
@@ -265,8 +265,8 @@ from the same generated trace: :: - $ llvm-xray convert -symbolize -instr_map=./bin/llc \ - -output-format=trace_event xray-log.llc.5rqxkU \ + $ llvm-xray convert --symbolize --instr_map=./bin/llc \ + --output-format=trace_event xray-log.llc.5rqxkU \ | gzip > llc-trace.txt.gz From a Chrome browser, navigating to ``chrome:///tracing`` allows us to load @@ -329,7 +329,7 @@ applications: :: - $ llvm-xray graph xray-log.sample.* -m sample -color-edges=sum -edge-label=sum \ + $ llvm-xray graph xray-log.sample.* -m sample --color-edges=sum --edge-label=sum \ | unflatten -f -l10 | dot -Tsvg -o sample.svg From 935a6d4024295201f1e3e59c995a9dfa986836d7 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 15 Aug 2021 21:19:04 -0700 Subject: [PATCH 084/700] [test] Change llvm-xray options to use the preferred double-dash forms and change -f= to -f --- .../X86/account-recursive-calls-only-tail-call-deduction.yaml | 4 ++-- .../tools/llvm-xray/X86/account-recursive-calls-only.yaml | 4 ++-- llvm/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt | 2 +- .../llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt | 2 +- .../llvm-xray/X86/convert-basic-log-version3-to-yaml.txt | 2 +- llvm/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt | 2 +- .../llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt | 2 +- .../tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt | 2 +- llvm/test/tools/llvm-xray/X86/convert-fdr-to-traceevent.txt | 2 +- llvm/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt | 2 +- llvm/test/tools/llvm-xray/X86/convert-roundtrip.yaml | 2 +- llvm/test/tools/llvm-xray/X86/convert-to-yaml.txt | 2 +- .../tools/llvm-xray/X86/convert-traceevent-special-events.txt | 2 +- llvm/test/tools/llvm-xray/X86/convert-with-debug-syms.txt | 2 +- .../tools/llvm-xray/X86/convert-with-standalone-instrmap.txt | 2 +- llvm/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt | 2 +- llvm/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt | 2 +- 
llvm/test/tools/llvm-xray/X86/fdr-dump-arg1.txt | 2 +- llvm/test/tools/llvm-xray/X86/stack-multithread.yaml | 4 ++-- 19 files changed, 22 insertions(+), 22 deletions(-) diff --git a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml index 4f5d01b2cf9b1..2b08cc0f7e984 100644 --- a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml +++ b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only-tail-call-deduction.yaml @@ -1,5 +1,5 @@ -# RUN: llvm-xray account -d %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s -# RUN: llvm-xray account -d -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s +# RUN: llvm-xray account -d %s -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s +# RUN: llvm-xray account -d --recursive-calls-only %s -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s --- header: diff --git a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml index d7b36200d10d3..d3b2d11d52a38 100644 --- a/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml +++ b/llvm/test/tools/llvm-xray/X86/account-recursive-calls-only.yaml @@ -1,5 +1,5 @@ -# RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s -# RUN: llvm-xray account -recursive-calls-only %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s +# RUN: llvm-xray account %s -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=ALL %s +# RUN: llvm-xray account --recursive-calls-only %s -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefixes=RECURSIVE %s --- header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt 
b/llvm/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt index 52ec12550a3d3..1ac24e18949e3 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/naive-with-arg1-entries.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/naive-with-arg1-entries.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt index 84c757c2b2639..cdddbbc45c369 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/basic-log-arg1-version-3.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/basic-log-arg1-version-3.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt index d2af2fc09c2eb..a545932200a44 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/basic-log-version-3.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/basic-log-version-3.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt index 592796434bd83..6f4716b01ec3f 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray 
convert %S/Inputs/fdr-log-arg1.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/fdr-log-arg1.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt index afeac68fa3dac..594bf01945c70 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/fdr-log-arg1-version-3.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/fdr-log-arg1-version-3.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt index fc70015c41e87..53edc3a3c16ff 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/fdr-log-version-3.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/fdr-log-version-3.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-fdr-to-traceevent.txt b/llvm/test/tools/llvm-xray/X86/convert-fdr-to-traceevent.txt index 2e845ab3aa92a..99b4791ca5f44 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-fdr-to-traceevent.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-fdr-to-traceevent.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/fdr-log-version-1.xray -f=trace_event -o - \ +; RUN: llvm-xray convert %S/Inputs/fdr-log-version-1.xray -f trace_event -o - \ ; RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ ; RUN: | FileCheck %s diff --git 
a/llvm/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt index 99bc7e11b97b8..cfe0126d9e256 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/fdr-log-version-1.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/fdr-log-version-1.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-roundtrip.yaml b/llvm/test/tools/llvm-xray/X86/convert-roundtrip.yaml index bbebd67e57611..44702ace87fc5 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-roundtrip.yaml +++ b/llvm/test/tools/llvm-xray/X86/convert-roundtrip.yaml @@ -1,4 +1,4 @@ -#RUN: llvm-xray convert %s -f=raw -o %t && llvm-xray convert %t -f=yaml -o - | FileCheck %s +#RUN: llvm-xray convert %s -f raw -o %t && llvm-xray convert %t -f yaml -o - | FileCheck %s --- header: version: 1 diff --git a/llvm/test/tools/llvm-xray/X86/convert-to-yaml.txt b/llvm/test/tools/llvm-xray/X86/convert-to-yaml.txt index f807fae3a64c5..93c22826294de 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-to-yaml.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-to-yaml.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert %S/Inputs/naive-log-simple.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-traceevent-special-events.txt b/llvm/test/tools/llvm-xray/X86/convert-traceevent-special-events.txt index 1693a4213a1b2..eb8fee7236847 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-traceevent-special-events.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-traceevent-special-events.txt @@ -1,4 +1,4 @@ -# RUN: llvm-xray convert %s -f=trace_event -o - \ +# RUN: llvm-xray convert %s -f trace_event -o - \ # RUN: | %python -c 'import json, sys; 
json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ # RUN: | FileCheck %s --- diff --git a/llvm/test/tools/llvm-xray/X86/convert-with-debug-syms.txt b/llvm/test/tools/llvm-xray/X86/convert-with-debug-syms.txt index dbb98e3d3cf05..dd601486b2fb1 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-with-debug-syms.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-with-debug-syms.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert -m %S/Inputs/elf64-sample-o2.bin -y %S/Inputs/naive-log-simple.xray -f=yaml -o - 2>&1 | FileCheck %s +; RUN: llvm-xray convert -m %S/Inputs/elf64-sample-o2.bin -y %S/Inputs/naive-log-simple.xray -f yaml -o - 2>&1 | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt b/llvm/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt index 9a1218256565e..5c8e8e4cb5ccf 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert -m %S/Inputs/elf64-objcopied-instrmap.bin -y %S/Inputs/naive-log-simple.xray -f=yaml -o - 2>&1 | FileCheck %s +; RUN: llvm-xray convert -m %S/Inputs/elf64-objcopied-instrmap.bin -y %S/Inputs/naive-log-simple.xray -f yaml -o - 2>&1 | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git a/llvm/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt b/llvm/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt index 1efcb3572bad8..d705ee25d2264 100644 --- a/llvm/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt +++ b/llvm/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray convert -m %S/Inputs/simple-xray-instrmap.yaml %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s +; RUN: llvm-xray convert -m %S/Inputs/simple-xray-instrmap.yaml %S/Inputs/naive-log-simple.xray -f yaml -o - | FileCheck %s ; CHECK: --- ; CHECK-NEXT: header: diff --git 
a/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt b/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt index ccb8a1b0538b7..7288c320c1b46 100644 --- a/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt +++ b/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray fdr-dump -verify %S/Inputs/fdr-log-arg1-version-3.xray \ +; RUN: llvm-xray fdr-dump --verify %S/Inputs/fdr-log-arg1-version-3.xray \ ; RUN: | FileCheck %s ; CHECK: [New Block] diff --git a/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1.txt b/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1.txt index 8fb381a170c32..f49bd3f7036c1 100644 --- a/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1.txt +++ b/llvm/test/tools/llvm-xray/X86/fdr-dump-arg1.txt @@ -1,4 +1,4 @@ -; RUN: llvm-xray fdr-dump -verify %S/Inputs/fdr-log-arg1.xray | FileCheck %s +; RUN: llvm-xray fdr-dump --verify %S/Inputs/fdr-log-arg1.xray | FileCheck %s ; CHECK: [New Block] ; CHECK-NEXT: Preamble: diff --git a/llvm/test/tools/llvm-xray/X86/stack-multithread.yaml b/llvm/test/tools/llvm-xray/X86/stack-multithread.yaml index 95be7f770817b..ce8ffce62c24f 100644 --- a/llvm/test/tools/llvm-xray/X86/stack-multithread.yaml +++ b/llvm/test/tools/llvm-xray/X86/stack-multithread.yaml @@ -1,5 +1,5 @@ -#RUN: llvm-xray stack -per-thread-stacks %s | FileCheck %s --check-prefix PER-THREAD -#RUN: llvm-xray stack -aggregate-threads %s | FileCheck %s --check-prefix AGGREGATE +#RUN: llvm-xray stack --per-thread-stacks %s | FileCheck %s --check-prefix PER-THREAD +#RUN: llvm-xray stack --aggregate-threads %s | FileCheck %s --check-prefix AGGREGATE --- header: From b82ce77b2bf49ee3f032859535fe3f081cea0331 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 16 Aug 2021 12:31:12 +0800 Subject: [PATCH 085/700] [X86] Support avx512fp16 compare instructions in the IntelInstPrinter. This enables printing of the mnemonics that contain the predicate in the Intel printer. 
This requires accounting for the memory size that is explicitly printed in Intel syntax. Those changes have been synced to the ATT printer as well. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D108093 --- .../X86/MCTargetDesc/X86ATTInstPrinter.cpp | 32 ++++++--- .../X86/MCTargetDesc/X86IntelInstPrinter.cpp | 46 +++++++++--- llvm/test/MC/Disassembler/X86/avx512fp16.txt | 24 +++---- .../test/MC/Disassembler/X86/avx512fp16vl.txt | 20 +++--- llvm/test/MC/X86/intel-syntax-avx512fp16.s | 72 +++++++++---------- llvm/test/MC/X86/intel-syntax-avx512fp16vl.s | 60 ++++++++-------- 6 files changed, 147 insertions(+), 107 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index ef0b5c1fe11c8..baacf2f46183f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -176,11 +176,15 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { if (Desc.TSFlags & X86II::EVEX_B) { // Broadcast form. - // Load size is based on W-bit. - if (Desc.TSFlags & X86II::VEX_W) + // Load size is word for TA map. Otherwise it is based on W-bit. + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) { + assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!"); + printwordmem(MI, CurOp--, OS); + } else if (Desc.TSFlags & X86II::VEX_W) { printqwordmem(MI, CurOp--, OS); - else + } else { printdwordmem(MI, CurOp--, OS); + } // Print the number of elements broadcasted. unsigned NumElts; @@ -190,20 +194,28 @@ bool X86ATTInstPrinter::printVecCompareInstr(const MCInst *MI, NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; else NumElts = (Desc.TSFlags & X86II::VEX_W) ? 
2 : 4; - if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) { + assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!"); NumElts *= 2; + } OS << "{1to" << NumElts << "}"; } else { - if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) - printdwordmem(MI, CurOp--, OS); - else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) { + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) + printwordmem(MI, CurOp--, OS); + else + printdwordmem(MI, CurOp--, OS); + } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) { + assert((Desc.TSFlags & X86II::OpMapMask) != X86II::TA && + "Unexpected op map!"); printqwordmem(MI, CurOp--, OS); - else if (Desc.TSFlags & X86II::EVEX_L2) + } else if (Desc.TSFlags & X86II::EVEX_L2) { printzmmwordmem(MI, CurOp--, OS); - else if (Desc.TSFlags & X86II::VEX_L) + } else if (Desc.TSFlags & X86II::VEX_L) { printymmwordmem(MI, CurOp--, OS); - else + } else { printxmmwordmem(MI, CurOp--, OS); + } } } else { if (Desc.TSFlags & X86II::EVEX_B) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index e88d3d06eddff..48c335f9a777c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -132,6 +132,20 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS case X86::VCMPPSZrrib: case X86::VCMPPSZrribk: case X86::VCMPSDZrrb_Int: case X86::VCMPSDZrrb_Intk: case X86::VCMPSSZrrb_Int: case X86::VCMPSSZrrb_Intk: + case X86::VCMPPHZ128rmi: case X86::VCMPPHZ128rri: + case X86::VCMPPHZ256rmi: case X86::VCMPPHZ256rri: + case X86::VCMPPHZrmi: case X86::VCMPPHZrri: + case X86::VCMPSHZrm: case X86::VCMPSHZrr: + case X86::VCMPSHZrm_Int: case X86::VCMPSHZrr_Int: + case X86::VCMPPHZ128rmik: case X86::VCMPPHZ128rrik: + case X86::VCMPPHZ256rmik: case 
X86::VCMPPHZ256rrik: + case X86::VCMPPHZrmik: case X86::VCMPPHZrrik: + case X86::VCMPSHZrm_Intk: case X86::VCMPSHZrr_Intk: + case X86::VCMPPHZ128rmbi: case X86::VCMPPHZ128rmbik: + case X86::VCMPPHZ256rmbi: case X86::VCMPPHZ256rmbik: + case X86::VCMPPHZrmbi: case X86::VCMPPHZrmbik: + case X86::VCMPPHZrrib: case X86::VCMPPHZrribk: + case X86::VCMPSHZrrb_Int: case X86::VCMPSHZrrb_Intk: if (Imm >= 0 && Imm <= 31) { OS << '\t'; printCMPMnemonic(MI, /*IsVCMP*/true, OS); @@ -152,11 +166,15 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS if ((Desc.TSFlags & X86II::FormMask) == X86II::MRMSrcMem) { if (Desc.TSFlags & X86II::EVEX_B) { // Broadcast form. - // Load size is based on W-bit. - if (Desc.TSFlags & X86II::VEX_W) + // Load size is word for TA map. Otherwise it is based on W-bit. + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) { + assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!"); + printwordmem(MI, CurOp++, OS); + } else if (Desc.TSFlags & X86II::VEX_W) { printqwordmem(MI, CurOp++, OS); - else + } else { printdwordmem(MI, CurOp++, OS); + } // Print the number of elements broadcasted. unsigned NumElts; @@ -166,18 +184,28 @@ bool X86IntelInstPrinter::printVecCompareInstr(const MCInst *MI, raw_ostream &OS NumElts = (Desc.TSFlags & X86II::VEX_W) ? 4 : 8; else NumElts = (Desc.TSFlags & X86II::VEX_W) ? 
2 : 4; + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) { + assert(!(Desc.TSFlags & X86II::VEX_W) && "Unknown W-bit value!"); + NumElts *= 2; + } OS << "{1to" << NumElts << "}"; } else { - if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) - printdwordmem(MI, CurOp++, OS); - else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) + if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XS) { + if ((Desc.TSFlags & X86II::OpMapMask) == X86II::TA) + printwordmem(MI, CurOp++, OS); + else + printdwordmem(MI, CurOp++, OS); + } else if ((Desc.TSFlags & X86II::OpPrefixMask) == X86II::XD) { + assert((Desc.TSFlags & X86II::OpMapMask) != X86II::TA && + "Unexpected op map!"); printqwordmem(MI, CurOp++, OS); - else if (Desc.TSFlags & X86II::EVEX_L2) + } else if (Desc.TSFlags & X86II::EVEX_L2) { printzmmwordmem(MI, CurOp++, OS); - else if (Desc.TSFlags & X86II::VEX_L) + } else if (Desc.TSFlags & X86II::VEX_L) { printymmwordmem(MI, CurOp++, OS); - else + } else { printxmmwordmem(MI, CurOp++, OS); + } } } else { printOperand(MI, CurOp++, OS); diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt index ea8db82f89a2b..6ba043ecd1be2 100644 --- a/llvm/test/MC/Disassembler/X86/avx512fp16.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -126,51 +126,51 @@ 0x62,0x65,0x16,0x87,0x58,0x72,0x80 # ATT: vcmpeqph %zmm28, %zmm29, %k5 -# INTEL: vcmpph k5, zmm29, zmm28, 0 +# INTEL: vcmpeqph k5, zmm29, zmm28 0x62,0x93,0x14,0x40,0xc2,0xec,0x00 # ATT: vcmpleph {sae}, %zmm28, %zmm29, %k5 -# INTEL: vcmpph k5, zmm29, zmm28, {sae}, 2 +# INTEL: vcmpleph k5, zmm29, zmm28, {sae} 0x62,0x93,0x14,0x10,0xc2,0xec,0x02 # ATT: vcmpneqph 268435456(%rbp,%r14,8), %zmm29, %k5 {%k7} -# INTEL: vcmpph k5 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456], 4 +# INTEL: vcmpneqph k5 {k7}, zmm29, zmmword ptr [rbp + 8*r14 + 268435456] 0x62,0xb3,0x14,0x47,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x04 # ATT: vcmpnleph (%r9){1to32}, %zmm29, %k5 -# INTEL: vcmpph k5, 
zmm29, word ptr [r9]{1to32}, 6 +# INTEL: vcmpnleph k5, zmm29, word ptr [r9]{1to32} 0x62,0xd3,0x14,0x50,0xc2,0x29,0x06 # ATT: vcmpeq_uqph 8128(%rcx), %zmm29, %k5 -# INTEL: vcmpph k5, zmm29, zmmword ptr [rcx + 8128], 8 +# INTEL: vcmpeq_uqph k5, zmm29, zmmword ptr [rcx + 8128] 0x62,0xf3,0x14,0x40,0xc2,0x69,0x7f,0x08 # ATT: vcmpngtph -256(%rdx){1to32}, %zmm29, %k5 {%k7} -# INTEL: vcmpph k5 {k7}, zmm29, word ptr [rdx - 256]{1to32}, 10 +# INTEL: vcmpngtph k5 {k7}, zmm29, word ptr [rdx - 256]{1to32} 0x62,0xf3,0x14,0x57,0xc2,0x6a,0x80,0x0a # ATT: vcmpneq_oqsh %xmm28, %xmm29, %k5 -# INTEL: vcmpsh k5, xmm29, xmm28, 12 +# INTEL: vcmpneq_oqsh k5, xmm29, xmm28 0x62,0x93,0x16,0x00,0xc2,0xec,0x0c # ATT: vcmpgtsh {sae}, %xmm28, %xmm29, %k5 -# INTEL: vcmpsh k5, xmm29, xmm28, {sae}, 14 +# INTEL: vcmpgtsh k5, xmm29, xmm28, {sae} 0x62,0x93,0x16,0x10,0xc2,0xec,0x0e # ATT: vcmpeq_ossh 268435456(%rbp,%r14,8), %xmm29, %k5 {%k7} -# INTEL: vcmpsh k5 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456], 16 +# INTEL: vcmpeq_ossh k5 {k7}, xmm29, word ptr [rbp + 8*r14 + 268435456] 0x62,0xb3,0x16,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x10 # ATT: vcmple_oqsh (%r9), %xmm29, %k5 -# INTEL: vcmpsh k5, xmm29, word ptr [r9], 18 +# INTEL: vcmple_oqsh k5, xmm29, word ptr [r9] 0x62,0xd3,0x16,0x00,0xc2,0x29,0x12 # ATT: vcmpneq_ussh 254(%rcx), %xmm29, %k5 -# INTEL: vcmpsh k5, xmm29, word ptr [rcx + 254], 20 +# INTEL: vcmpneq_ussh k5, xmm29, word ptr [rcx + 254] 0x62,0xf3,0x16,0x00,0xc2,0x69,0x7f,0x14 # ATT: vcmpnle_uqsh -256(%rdx), %xmm29, %k5 {%k7} -# INTEL: vcmpsh k5 {k7}, xmm29, word ptr [rdx - 256], 22 +# INTEL: vcmpnle_uqsh k5 {k7}, xmm29, word ptr [rdx - 256] 0x62,0xf3,0x16,0x07,0xc2,0x6a,0x80,0x16 # ATT: vcomish %xmm29, %xmm30 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt index 5f695c0bd3cef..362215492e1b3 100644 --- a/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt +++ b/llvm/test/MC/Disassembler/X86/avx512fp16vl.txt @@ -42,43 +42,43 @@ 
0x62,0xf5,0x54,0x9f,0x58,0x72,0x80 # ATT: vcmpltph %ymm4, %ymm5, %k5 -# INTEL: vcmpph k5, ymm5, ymm4, 1 +# INTEL: vcmpltph k5, ymm5, ymm4 0x62,0xf3,0x54,0x28,0xc2,0xec,0x01 # ATT: vcmpunordph %xmm4, %xmm5, %k5 -# INTEL: vcmpph k5, xmm5, xmm4, 3 +# INTEL: vcmpunordph k5, xmm5, xmm4 0x62,0xf3,0x54,0x08,0xc2,0xec,0x03 # ATT: vcmpnltph 268435456(%esp,%esi,8), %xmm5, %k5 {%k7} -# INTEL: vcmpph k5 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456], 5 +# INTEL: vcmpnltph k5 {k7}, xmm5, xmmword ptr [esp + 8*esi + 268435456] 0x62,0xf3,0x54,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x05 # ATT: vcmpordph (%ecx){1to8}, %xmm5, %k5 -# INTEL: vcmpph k5, xmm5, word ptr [ecx]{1to8}, 7 +# INTEL: vcmpordph k5, xmm5, word ptr [ecx]{1to8} 0x62,0xf3,0x54,0x18,0xc2,0x29,0x07 # ATT: vcmpngeph 2032(%ecx), %xmm5, %k5 -# INTEL: vcmpph k5, xmm5, xmmword ptr [ecx + 2032], 9 +# INTEL: vcmpngeph k5, xmm5, xmmword ptr [ecx + 2032] 0x62,0xf3,0x54,0x08,0xc2,0x69,0x7f,0x09 # ATT: vcmpfalseph -256(%edx){1to8}, %xmm5, %k5 {%k7} -# INTEL: vcmpph k5 {k7}, xmm5, word ptr [edx - 256]{1to8}, 11 +# INTEL: vcmpfalseph k5 {k7}, xmm5, word ptr [edx - 256]{1to8} 0x62,0xf3,0x54,0x1f,0xc2,0x6a,0x80,0x0b # ATT: vcmpgeph 268435456(%esp,%esi,8), %ymm5, %k5 {%k7} -# INTEL: vcmpph k5 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456], 13 +# INTEL: vcmpgeph k5 {k7}, ymm5, ymmword ptr [esp + 8*esi + 268435456] 0x62,0xf3,0x54,0x2f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x0d # ATT: vcmptrueph (%ecx){1to16}, %ymm5, %k5 -# INTEL: vcmpph k5, ymm5, word ptr [ecx]{1to16}, 15 +# INTEL: vcmptrueph k5, ymm5, word ptr [ecx]{1to16} 0x62,0xf3,0x54,0x38,0xc2,0x29,0x0f # ATT: vcmplt_oqph 4064(%ecx), %ymm5, %k5 -# INTEL: vcmpph k5, ymm5, ymmword ptr [ecx + 4064], 17 +# INTEL: vcmplt_oqph k5, ymm5, ymmword ptr [ecx + 4064] 0x62,0xf3,0x54,0x28,0xc2,0x69,0x7f,0x11 # ATT: vcmpunord_sph -256(%edx){1to16}, %ymm5, %k5 {%k7} -# INTEL: vcmpph k5 {k7}, ymm5, word ptr [edx - 256]{1to16}, 19 +# INTEL: vcmpunord_sph k5 {k7}, ymm5, word ptr [edx - 
256]{1to16} 0x62,0xf3,0x54,0x3f,0xc2,0x6a,0x80,0x13 # ATT: vdivph %ymm4, %ymm5, %ymm6 diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s index ae6ab881efdfb..5d95bc82375a0 100644 --- a/llvm/test/MC/X86/intel-syntax-avx512fp16.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -124,53 +124,53 @@ // CHECK: encoding: [0x62,0xf5,0x56,0x8f,0x58,0x72,0x80] vaddsh xmm6 {k7} {z}, xmm5, word ptr [edx - 256] -// CHECK: vcmpph k5, zmm5, zmm4, 123 -// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0xec,0x7b] - vcmpph k5, zmm5, zmm4, 123 +// CHECK: vcmpneq_usph k5, zmm5, zmm4 +// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0xec,0x14] + vcmpneq_usph k5, zmm5, zmm4 -// CHECK: vcmpph k5, zmm5, zmm4, {sae}, 123 -// CHECK: encoding: [0x62,0xf3,0x54,0x18,0xc2,0xec,0x7b] - vcmpph k5, zmm5, zmm4, {sae}, 123 +// CHECK: vcmpnlt_uqph k5, zmm5, zmm4, {sae} +// CHECK: encoding: [0x62,0xf3,0x54,0x18,0xc2,0xec,0x15] + vcmpnlt_uqph k5, zmm5, zmm4, {sae} -// CHECK: vcmpph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456], 123 -// CHECK: encoding: [0x62,0xf3,0x54,0x4f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] - vcmpph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456], 123 +// CHECK: vcmpnle_uqph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf3,0x54,0x4f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x16] + vcmpnle_uqph k5 {k7}, zmm5, zmmword ptr [esp + 8*esi + 268435456] -// CHECK: vcmpph k5, zmm5, word ptr [ecx]{1to32}, 123 -// CHECK: encoding: [0x62,0xf3,0x54,0x58,0xc2,0x29,0x7b] - vcmpph k5, zmm5, word ptr [ecx]{1to32}, 123 +// CHECK: vcmpord_sph k5, zmm5, word ptr [ecx]{1to32} +// CHECK: encoding: [0x62,0xf3,0x54,0x58,0xc2,0x29,0x17] + vcmpord_sph k5, zmm5, word ptr [ecx]{1to32} -// CHECK: vcmpph k5, zmm5, zmmword ptr [ecx + 8128], 123 -// CHECK: encoding: [0x62,0xf3,0x54,0x48,0xc2,0x69,0x7f,0x7b] - vcmpph k5, zmm5, zmmword ptr [ecx + 8128], 123 +// CHECK: vcmpeq_usph k5, zmm5, zmmword ptr [ecx + 8128] +// CHECK: 
encoding: [0x62,0xf3,0x54,0x48,0xc2,0x69,0x7f,0x18] + vcmpeq_usph k5, zmm5, zmmword ptr [ecx + 8128] -// CHECK: vcmpph k5 {k7}, zmm5, word ptr [edx - 256]{1to32}, 123 -// CHECK: encoding: [0x62,0xf3,0x54,0x5f,0xc2,0x6a,0x80,0x7b] - vcmpph k5 {k7}, zmm5, word ptr [edx - 256]{1to32}, 123 +// CHECK: vcmpnge_uqph k5 {k7}, zmm5, word ptr [edx - 256]{1to32} +// CHECK: encoding: [0x62,0xf3,0x54,0x5f,0xc2,0x6a,0x80,0x19] + vcmpnge_uqph k5 {k7}, zmm5, word ptr [edx - 256]{1to32} -// CHECK: vcmpsh k5, xmm5, xmm4, 123 -// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0xec,0x7b] - vcmpsh k5, xmm5, xmm4, 123 +// CHECK: vcmpngt_uqsh k5, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0xec,0x1a] + vcmpngt_uqsh k5, xmm5, xmm4 -// CHECK: vcmpsh k5, xmm5, xmm4, {sae}, 123 -// CHECK: encoding: [0x62,0xf3,0x56,0x18,0xc2,0xec,0x7b] - vcmpsh k5, xmm5, xmm4, {sae}, 123 +// CHECK: vcmpfalse_ossh k5, xmm5, xmm4, {sae} +// CHECK: encoding: [0x62,0xf3,0x56,0x18,0xc2,0xec,0x1b] + vcmpfalse_ossh k5, xmm5, xmm4, {sae} -// CHECK: vcmpsh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 -// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x7b] - vcmpsh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456], 123 +// CHECK: vcmpneq_ossh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0xac,0xf4,0x00,0x00,0x00,0x10,0x1c] + vcmpneq_ossh k5 {k7}, xmm5, word ptr [esp + 8*esi + 268435456] -// CHECK: vcmpsh k5, xmm5, word ptr [ecx], 123 -// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0x29,0x7b] - vcmpsh k5, xmm5, word ptr [ecx], 123 +// CHECK: vcmpge_oqsh k5, xmm5, word ptr [ecx] +// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0x29,0x1d] + vcmpge_oqsh k5, xmm5, word ptr [ecx] -// CHECK: vcmpsh k5, xmm5, word ptr [ecx + 254], 123 -// CHECK: encoding: [0x62,0xf3,0x56,0x08,0xc2,0x69,0x7f,0x7b] - vcmpsh k5, xmm5, word ptr [ecx + 254], 123 +// CHECK: vcmpgt_oqsh k5, xmm5, word ptr [ecx + 254] +// CHECK: encoding: 
[0x62,0xf3,0x56,0x08,0xc2,0x69,0x7f,0x1e] + vcmpgt_oqsh k5, xmm5, word ptr [ecx + 254] -// CHECK: vcmpsh k5 {k7}, xmm5, word ptr [edx - 256], 123 -// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0x6a,0x80,0x7b] - vcmpsh k5 {k7}, xmm5, word ptr [edx - 256], 123 +// CHECK: vcmptrue_ussh k5 {k7}, xmm5, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf3,0x56,0x0f,0xc2,0x6a,0x80,0x1f] + vcmptrue_ussh k5 {k7}, xmm5, word ptr [edx - 256] // CHECK: vcomish xmm6, xmm5 // CHECK: encoding: [0x62,0xf5,0x7c,0x08,0x2f,0xf5] diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s index 161208fdb452e..d6ccd32bbfc16 100644 --- a/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16vl.s @@ -40,45 +40,45 @@ // CHECK: encoding: [0x62,0x65,0x14,0x97,0x58,0x72,0x80] vaddph xmm30 {k7} {z}, xmm29, word ptr [rdx - 256]{1to8} -// CHECK: vcmpph k5, ymm29, ymm28, 123 -// CHECK: encoding: [0x62,0x93,0x14,0x20,0xc2,0xec,0x7b] - vcmpph k5, ymm29, ymm28, 123 +// CHECK: vcmpeqph k5, ymm29, ymm28 +// CHECK: encoding: [0x62,0x93,0x14,0x20,0xc2,0xec,0x00] + vcmpph k5, ymm29, ymm28, 0 -// CHECK: vcmpph k5, xmm29, xmm28, 123 -// CHECK: encoding: [0x62,0x93,0x14,0x00,0xc2,0xec,0x7b] - vcmpph k5, xmm29, xmm28, 123 +// CHECK: vcmpltph k5, xmm29, xmm28 +// CHECK: encoding: [0x62,0x93,0x14,0x00,0xc2,0xec,0x01] + vcmpph k5, xmm29, xmm28, 1 -// CHECK: vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 123 -// CHECK: encoding: [0x62,0xb3,0x14,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] - vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: vcmpleph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xb3,0x14,0x07,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x02] + vcmpph k5 {k7}, xmm29, xmmword ptr [rbp + 8*r14 + 268435456], 2 -// CHECK: vcmpph k5, xmm29, word ptr [r9]{1to8}, 123 -// CHECK: encoding: [0x62,0xd3,0x14,0x10,0xc2,0x29,0x7b] - vcmpph k5, xmm29, word ptr 
[r9]{1to8}, 123 +// CHECK: vcmpunordph k5, xmm29, word ptr [r9]{1to8} +// CHECK: encoding: [0x62,0xd3,0x14,0x10,0xc2,0x29,0x03] + vcmpph k5, xmm29, word ptr [r9]{1to8}, 3 -// CHECK: vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 123 -// CHECK: encoding: [0x62,0xf3,0x14,0x00,0xc2,0x69,0x7f,0x7b] - vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 123 +// CHECK: vcmpneqph k5, xmm29, xmmword ptr [rcx + 2032] +// CHECK: encoding: [0x62,0xf3,0x14,0x00,0xc2,0x69,0x7f,0x04] + vcmpph k5, xmm29, xmmword ptr [rcx + 2032], 4 -// CHECK: vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 123 -// CHECK: encoding: [0x62,0xf3,0x14,0x17,0xc2,0x6a,0x80,0x7b] - vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 123 +// CHECK: vcmpnltph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8} +// CHECK: encoding: [0x62,0xf3,0x14,0x17,0xc2,0x6a,0x80,0x05] + vcmpph k5 {k7}, xmm29, word ptr [rdx - 256]{1to8}, 5 -// CHECK: vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 123 -// CHECK: encoding: [0x62,0xb3,0x14,0x27,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x7b] - vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 123 +// CHECK: vcmpnleph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xb3,0x14,0x27,0xc2,0xac,0xf5,0x00,0x00,0x00,0x10,0x06] + vcmpph k5 {k7}, ymm29, ymmword ptr [rbp + 8*r14 + 268435456], 6 -// CHECK: vcmpph k5, ymm29, word ptr [r9]{1to16}, 123 -// CHECK: encoding: [0x62,0xd3,0x14,0x30,0xc2,0x29,0x7b] - vcmpph k5, ymm29, word ptr [r9]{1to16}, 123 +// CHECK: vcmpordph k5, ymm29, word ptr [r9]{1to16} +// CHECK: encoding: [0x62,0xd3,0x14,0x30,0xc2,0x29,0x07] + vcmpph k5, ymm29, word ptr [r9]{1to16}, 7 -// CHECK: vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 123 -// CHECK: encoding: [0x62,0xf3,0x14,0x20,0xc2,0x69,0x7f,0x7b] - vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 123 +// CHECK: vcmpeq_uqph k5, ymm29, ymmword ptr [rcx + 4064] +// CHECK: encoding: [0x62,0xf3,0x14,0x20,0xc2,0x69,0x7f,0x08] + vcmpph k5, ymm29, ymmword ptr [rcx + 4064], 8 -// CHECK: 
vcmpph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16}, 123 -// CHECK: encoding: [0x62,0xf3,0x14,0x37,0xc2,0x6a,0x80,0x7b] - vcmpph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16}, 123 +// CHECK: vcmpngeph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16} +// CHECK: encoding: [0x62,0xf3,0x14,0x37,0xc2,0x6a,0x80,0x09] + vcmpph k5 {k7}, ymm29, word ptr [rdx - 256]{1to16}, 9 // CHECK: vdivph ymm30, ymm29, ymm28 // CHECK: encoding: [0x62,0x05,0x14,0x20,0x5e,0xf4] From 47d9d55c66601ffb5939ce07a7eb3c96122c48d6 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Wed, 28 Jul 2021 01:58:28 -0400 Subject: [PATCH 086/700] [clangd] Do not show inlay hints pertaining to code in other files Fixes https://github.com/clangd/clangd/issues/817 Differential Revision: https://reviews.llvm.org/D106934 --- clang-tools-extra/clangd/InlayHints.cpp | 5 ++++ .../clangd/unittests/InlayHintTests.cpp | 23 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index 1283aa4dd62cc..7c3c6a2421d83 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -13,6 +13,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" +#include "llvm/Support/raw_ostream.h" namespace clang { namespace clangd { @@ -314,6 +315,10 @@ class InlayHintVisitor : public RecursiveASTVisitor { toHalfOpenFileRange(AST.getSourceManager(), AST.getLangOpts(), R); if (!FileRange) return; + // The hint may be in a file other than the main file (for example, a header + // file that was included after the preamble), do not show in that case. 
+ if (!AST.getSourceManager().isWrittenInMainFile(FileRange->getBegin())) + return; Results.push_back(InlayHint{ Range{ sourceLocToPosition(AST.getSourceManager(), FileRange->getBegin()), diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index 1410ed115b6bf..6796a8ce70fff 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -9,6 +9,7 @@ #include "InlayHints.h" #include "Protocol.h" #include "TestTU.h" +#include "TestWorkspace.h" #include "XRefs.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -398,6 +399,28 @@ TEST(ParameterHints, SetterFunctions) { ExpectedHint{"timeout_millis: ", "timeout_millis"}); } +TEST(ParameterHints, IncludeAtNonGlobalScope) { + Annotations FooInc(R"cpp( + void bar() { foo(42); } + )cpp"); + Annotations FooCC(R"cpp( + struct S { + void foo(int param); + #include "foo.inc" + }; + )cpp"); + + TestWorkspace Workspace; + Workspace.addSource("foo.inc", FooInc.code()); + Workspace.addMainFile("foo.cc", FooCC.code()); + + auto AST = Workspace.openFile("foo.cc"); + ASSERT_TRUE(bool(AST)); + + // Ensure the hint for the call in foo.inc is NOT materialized in foo.cc. 
+ EXPECT_EQ(hintsOfKind(*AST, InlayHintKind::ParameterHint).size(), 0u); +} + TEST(TypeHints, Smoke) { assertTypeHints(R"cpp( auto $waldo[[waldo]] = 42; From b8d451da8610f0dd3ab55289606d7f2973e708d6 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 16 Aug 2021 09:07:33 +0200 Subject: [PATCH 087/700] Add support of the future Debian (Debian 12 - Bookworm) https://wiki.debian.org/DebianBookworm ETA: 2023 --- clang/include/clang/Driver/Distro.h | 3 ++- clang/lib/Driver/Distro.cpp | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index 0d2a0939639ea..d9909bcf96968 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -37,6 +37,7 @@ class Distro { DebianStretch, DebianBuster, DebianBullseye, + DebianBookworm, Exherbo, RHEL5, RHEL6, @@ -119,7 +120,7 @@ class Distro { bool IsOpenSUSE() const { return DistroVal == OpenSUSE; } bool IsDebian() const { - return DistroVal >= DebianLenny && DistroVal <= DebianBullseye; + return DistroVal >= DebianLenny && DistroVal <= DebianBookworm; } bool IsUbuntu() const { diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index c4cf4e48b5b8d..cdb5a1725750f 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -150,6 +150,8 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { return Distro::DebianBuster; case 11: return Distro::DebianBullseye; + case 12: + return Distro::DebianBookworm; default: return Distro::UnknownDistro; } @@ -161,6 +163,7 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { .Case("stretch/sid", Distro::DebianStretch) .Case("buster/sid", Distro::DebianBuster) .Case("bullseye/sid", Distro::DebianBullseye) + .Case("bookworm/sid", Distro::DebianBookworm) .Default(Distro::UnknownDistro); } From 2848f6966ea35a1e8bdf1668ee4ae8fb0170235c Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Tue, 10 Aug 2021 13:53:59 
+0200 Subject: [PATCH 088/700] [mlir] Set top-down traversal for LinalgElementwiseOpFusion The primary pattern for this pass clones many operations from producers to consumers. Doing this top down prevents duplicated work when a producer has multiple consumers, if it also is consuming another linalg.generic. As an example, a chain of ~2600 generics that are fused into ~70 generics was resulting in 16255 pattern invocations. This took 14 seconds on one machine but takes only 0.3 seconds with top-down traversal. Differential Revision: https://reviews.llvm.org/D107818 --- mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index fdca523b38544..43a4105e4c3f8 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -1294,7 +1294,12 @@ struct LinalgElementwiseOpFusionPass patterns, LinalgElementwiseFusionOptions().setControlFoldingReshapes( allowFoldingUnitDimReshapes ? allowFoldingFn : skipUnitDimReshape)); - (void)applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns)); + + // Use TopDownTraversal for compile time reasons + GreedyRewriteConfig grc; + grc.useTopDownTraversal = true; + (void)applyPatternsAndFoldGreedily(op->getRegions(), std::move(patterns), + grc); } }; From 7185007735cd08a1928765f08301c48382c6222e Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 16 Aug 2021 09:43:26 +0200 Subject: [PATCH 089/700] sanitizer_common: fix format string in LibIgnore uptr should be printed with %zu. 
Differential Revision: https://reviews.llvm.org/D108106 --- compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp index 79f2f6a1517e5..caaba3155a7be 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_libignore.cpp @@ -24,7 +24,7 @@ LibIgnore::LibIgnore(LinkerInitialized) { void LibIgnore::AddIgnoredLibrary(const char *name_templ) { Lock lock(&mutex_); if (count_ >= kMaxLibs) { - Report("%s: too many ignored libraries (max: %lu)\n", SanitizerToolName, + Report("%s: too many ignored libraries (max: %zu)\n", SanitizerToolName, kMaxLibs); Die(); } From 93c55d5ea24b8f455b0621bac373f142e0008739 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 10 Jun 2021 08:56:15 +0200 Subject: [PATCH 090/700] Reset all options in cl::ResetCommandLineParser() Reset cl::Positional, cl::Sink and cl::ConsumeAfter options as well in cl::ResetCommandLineParser(). Reviewed By: rriddle, sammccall Differential Revision: https://reviews.llvm.org/D103356 --- llvm/lib/Support/CommandLine.cpp | 12 +++++++-- llvm/unittests/Support/CommandLineTest.cpp | 30 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 6c140863b13cc..e64934aa90cc8 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -1321,12 +1321,20 @@ bool cl::ParseCommandLineOptions(int argc, const char *const *argv, Errs, LongOptionsUseDoubleDash); } +/// Reset all options at least once, so that we can parse different options. void CommandLineParser::ResetAllOptionOccurrences() { - // So that we can parse different command lines multiple times in succession - // we reset all option values to look like they have never been seen before. 
+ // Reset all option values to look like they have never been seen before. + // Options might be reset twice (they can be reference in both OptionsMap + // and one of the other members), but that does not harm. for (auto *SC : RegisteredSubCommands) { for (auto &O : SC->OptionsMap) O.second->reset(); + for (Option *O : SC->PositionalOpts) + O->reset(); + for (Option *O : SC->SinkOpts) + O->reset(); + if (SC->ConsumeAfterOpt) + SC->ConsumeAfterOpt->reset(); } } diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp index a0352bc8a4c5e..d8fd6f6516cdd 100644 --- a/llvm/unittests/Support/CommandLineTest.cpp +++ b/llvm/unittests/Support/CommandLineTest.cpp @@ -1894,4 +1894,34 @@ TEST(CommandLineTest, ConsumeAfterTwoPositionals) { EXPECT_TRUE(Errs.empty()); } +TEST(CommandLineTest, ResetAllOptionOccurrences) { + cl::ResetCommandLineParser(); + + // -option [sink] input [args] + StackOption Option("option"); + StackOption> Sink(cl::Sink); + StackOption Input(cl::Positional); + StackOption> ExtraArgs(cl::ConsumeAfter); + + const char *Args[] = {"prog", "-option", "-unknown", "input", "-arg"}; + + std::string Errs; + raw_string_ostream OS(Errs); + EXPECT_TRUE(cl::ParseCommandLineOptions(5, Args, StringRef(), &OS)); + EXPECT_TRUE(OS.str().empty()); + + EXPECT_TRUE(Option); + EXPECT_EQ(1, (int)Sink.size()); + EXPECT_EQ("-unknown", Sink[0]); + EXPECT_EQ("input", Input); + EXPECT_EQ(1, (int)ExtraArgs.size()); + EXPECT_EQ("-arg", ExtraArgs[0]); + + cl::ResetAllOptionOccurrences(); + EXPECT_FALSE(Option); + EXPECT_EQ(0, (int)Sink.size()); + EXPECT_EQ(0, Input.getNumOccurrences()); + EXPECT_EQ(0, (int)ExtraArgs.size()); +} + } // anonymous namespace From 09507b53250dc266632c204558cb1c2b56e8ddea Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 16 Aug 2021 07:31:55 +0000 Subject: [PATCH 091/700] [AArch64][SME] Disable NEON in streaming mode In streaming mode most of the NEON instruction set is illegal, disable NEON when compiling 
with `+streaming-sve`, unless NEON is explicitly requested. Subsequent patches will add support for the small subset of NEON instructions that are legal in streaming mode. Reviewed By: paulwalker-arm, david-arm Differential Revision: https://reviews.llvm.org/D107902 --- .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 11 ++++++++++- llvm/test/MC/AArch64/SME/streaming-sve-feature.s | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/AArch64/SME/streaming-sve-feature.s diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 3c2df1621e118..987cabce6cc98 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -57,7 +57,16 @@ createAArch64MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { CPU = "apple-a12"; } - return createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); + // Most of the NEON instruction set isn't supported in streaming mode on SME + // targets, disable NEON unless explicitly requested. 
+ bool RequestedNEON = FS.contains("neon"); + bool RequestedStreamingSVE = FS.contains("streaming-sve"); + MCSubtargetInfo *STI = + createAArch64MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); + if (RequestedStreamingSVE && !RequestedNEON && + STI->hasFeature(AArch64::FeatureNEON)) + STI->ToggleFeature(AArch64::FeatureNEON); + return STI; } void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { diff --git a/llvm/test/MC/AArch64/SME/streaming-sve-feature.s b/llvm/test/MC/AArch64/SME/streaming-sve-feature.s new file mode 100644 index 0000000000000..e35505ca39c58 --- /dev/null +++ b/llvm/test/MC/AArch64/SME/streaming-sve-feature.s @@ -0,0 +1,8 @@ +// RUN: llvm-mc -triple=aarch64 -mattr=+streaming-sve,+neon < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -mattr=+streaming-sve < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +// Verify NEON is disabled when targeting streaming mode, if it's not +// explicitly requested. +add v0.8b, v1.8b, v2.8b +// CHECK: add v0.8b, v1.8b, v2.8b +// CHECK-ERROR: error: instruction requires: neon From 7142eb17fb3419a76c9ac4afce0df986ff08d61c Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 12 Aug 2021 15:43:09 +0200 Subject: [PATCH 092/700] sanitizers: compile with -O1 under debug Tsan's check_memcpy.c test was disabled under debug because it failed. But it points to real issues and does not help to just disable it. I tried to enable it and see what fail and the first hit was default ctor for: struct ChainedOriginDepotDesc { u32 here_id; u32 prev_id; }; initializing these fields to 0's help partially, but compiler still emits memset before calling ctor. I did not try to see what's the next failure, because if it fails on such small structs, it won't be realistic to fix everything and keep working. Compile runtimes with -O1 under debug instead. It seems to fix all current failures. At least I run check-tsan under clang/gcc x debug/non-debug and all combinations passed. 
-O1 does not usually use too aggressive optimizations and sometimes even makes debugging easier because machine code is not exceedingly verbose. Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D107962 --- compiler-rt/CMakeLists.txt | 2 +- compiler-rt/test/tsan/Linux/check_memcpy.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index 83a68a4b04468..fc8a0cf6d46c5 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -378,7 +378,7 @@ if (NOT MSVC) # Build with optimization, unless we're in debug mode. if(COMPILER_RT_DEBUG) - list(APPEND SANITIZER_COMMON_CFLAGS -O0) + list(APPEND SANITIZER_COMMON_CFLAGS -O1) else() list(APPEND SANITIZER_COMMON_CFLAGS -O3) endif() diff --git a/compiler-rt/test/tsan/Linux/check_memcpy.c b/compiler-rt/test/tsan/Linux/check_memcpy.c index 75dd7da8316ba..55705ce8154c2 100644 --- a/compiler-rt/test/tsan/Linux/check_memcpy.c +++ b/compiler-rt/test/tsan/Linux/check_memcpy.c @@ -3,9 +3,7 @@ // its objdump. // RUN: %clang_tsan -O1 %s -o %t -// RUN: llvm-objdump -d %t | FileCheck %s - -// REQUIRES: compiler-rt-optimized +// RUN: llvm-objdump -d -l %t | FileCheck %s int main() { return 0; From 2eb554a9feafff5188d8b924908205c87d7f2fee Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 16 Aug 2021 10:53:15 +0300 Subject: [PATCH 093/700] Revert "Reland [SimplifyCFG] performBranchToCommonDestFolding(): form block-closed SSA form before cloning instructions (PR51125)" This is still wrong, as failing bots suggest. This reverts commit 3d9beefc7d713ad8462d92427ccd17b9532ce904. 
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 75 +++---------------- .../SimplifyCFG/fold-branch-to-common-dest.ll | 18 ++--- 2 files changed, 18 insertions(+), 75 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 68a0388398fc3..847fdd760d2fe 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1095,24 +1095,17 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( // Update (liveout) uses of bonus instructions, // now that the bonus instruction has been cloned into predecessor. - // Note that we expect to be in a block-closed SSA form for this to work! + SSAUpdater SSAUpdate; + SSAUpdate.Initialize(BonusInst.getType(), + (NewBonusInst->getName() + ".merge").str()); + SSAUpdate.AddAvailableValue(BB, &BonusInst); + SSAUpdate.AddAvailableValue(PredBlock, NewBonusInst); for (Use &U : make_early_inc_range(BonusInst.uses())) { auto *UI = cast(U.getUser()); - auto *PN = dyn_cast(UI); - if (!PN) { - assert(UI->getParent() == BB && BonusInst.comesBefore(UI) && - "If the user is not a PHI node, then it should be in the same " - "block as, and come after, the original bonus instruction."); - continue; // Keep using the original bonus instruction. - } - // Is this the block-closed SSA form PHI node? - if (PN->getIncomingBlock(U) == BB) - continue; // Great, keep using the original bonus instruction. - // The only other alternative is an "use" when coming from - // the predecessor block - here we should refer to the cloned bonus instr. - assert(PN->getIncomingBlock(U) == PredBlock && - "Not in block-closed SSA form?"); - U.set(NewBonusInst); + if (UI->getParent() != PredBlock) + SSAUpdate.RewriteUseAfterInsertions(U); + else // Use is in the same block as, and comes before, NewBonusInst. 
+ SSAUpdate.RewriteUse(U); } } } @@ -3039,56 +3032,6 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); - // We want to duplicate all the bonus instructions in this block, - // and rewrite their uses, but in some cases with self-loops, - // the naive use rewrite approach won't work (will result in miscompilations). - // To avoid this problem, let's form block-closed SSA form. - for (Instruction &BonusInst : - reverse(iterator_range(*BB))) { - auto IsBCSSAUse = [BB, &BonusInst](Use &U) { - auto *UI = cast(U.getUser()); - if (auto *PN = dyn_cast(UI)) - return PN->getIncomingBlock(U) == BB; - return UI->getParent() == BB && BonusInst.comesBefore(UI); - }; - - // Does this instruction require rewriting of uses? - if (all_of(BonusInst.uses(), IsBCSSAUse)) - continue; - - SSAUpdater SSAUpdate; - Type *Ty = BonusInst.getType(); - SmallVector BCSSAPHIs; - SSAUpdate.Initialize(Ty, BonusInst.getName()); - - // Into each successor block of BB, insert a PHI node, that receives - // the BonusInst when coming from it's basic block, or poison otherwise. - for (BasicBlock *Succ : successors(BB)) { - // The block may have the same successor multiple times. Do it only once. - if (SSAUpdate.HasValueForBlock(Succ)) - continue; - BCSSAPHIs.emplace_back(PHINode::Create( - Ty, 0, BonusInst.getName() + ".bcssa", &Succ->front())); - PHINode *PN = BCSSAPHIs.back(); - for (BasicBlock *PredOfSucc : predecessors(Succ)) - PN->addIncoming(PredOfSucc == BB ? (Value *)&BonusInst - : PoisonValue::get(Ty), - PredOfSucc); - SSAUpdate.AddAvailableValue(Succ, PN); - } - - // And rewrite all uses that break block-closed SSA form. - for (Use &U : make_early_inc_range(BonusInst.uses())) - if (!IsBCSSAUse(U)) - SSAUpdate.RewriteUseAfterInsertions(U); - - // We might not have ended up needing PHI's in all of the succ blocks, - // drop the ones that are certainly unused, but don't bother otherwise. 
- for (PHINode *PN : BCSSAPHIs) - if (PN->use_empty()) - PN->eraseFromParent(); - } - IRBuilder<> Builder(PBI); // The builder is used to create instructions to eliminate the branch in BB. // If BB's terminator has !annotation metadata, add it to the new diff --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll index d948b61d65a03..2ff0418260771 100644 --- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll +++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll @@ -834,7 +834,7 @@ define void @pr48450() { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_BCSSA1:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] +; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_MERGE:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] ; CHECK-NEXT: [[C:%.*]] = call i1 @gen1() ; CHECK-NEXT: br i1 [[C]], label [[FOR_INC:%.*]], label [[IF_THEN:%.*]] ; CHECK: for.inc: @@ -849,7 +849,7 @@ define void @pr48450() { ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C2_NOT]], i1 true, i1 [[CMP_NOT]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END_LOOPEXIT]], label [[FOR_BODYTHREAD_PRE_SPLIT]] ; CHECK: for.bodythread-pre-split: -; CHECK-NEXT: [[DEC_BCSSA1]] = phi i8 [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC]], [[IF_THEN]] ] +; CHECK-NEXT: [[DEC_MERGE]] = phi i8 [ [[DEC]], [[IF_THEN]] ], [ [[DEC_OLD]], [[FOR_INC]] ] ; CHECK-NEXT: call void @sideeffect0() ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: if.end.loopexit: @@ -885,7 +885,7 @@ define void @pr48450_2(i1 %enable_loopback) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_BCSSA1:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ] +; CHECK-NEXT: [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_MERGE:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] 
] ; CHECK-NEXT: [[C:%.*]] = call i1 @gen1() ; CHECK-NEXT: br i1 [[C]], label [[FOR_INC:%.*]], label [[IF_THEN:%.*]] ; CHECK: for.inc: @@ -900,7 +900,7 @@ define void @pr48450_2(i1 %enable_loopback) { ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C2_NOT]], i1 true, i1 [[CMP_NOT]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[IF_END_LOOPEXIT]], label [[FOR_BODYTHREAD_PRE_SPLIT]] ; CHECK: for.bodythread-pre-split: -; CHECK-NEXT: [[DEC_BCSSA1]] = phi i8 [ poison, [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK:%.*]] ], [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC]], [[IF_THEN]] ] +; CHECK-NEXT: [[DEC_MERGE]] = phi i8 [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC_MERGE]], [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK:%.*]] ], [ [[DEC]], [[IF_THEN]] ] ; CHECK-NEXT: [[SHOULD_LOOPBACK:%.*]] = phi i1 [ true, [[FOR_INC]] ], [ false, [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK]] ], [ true, [[IF_THEN]] ] ; CHECK-NEXT: [[DO_LOOPBACK:%.*]] = and i1 [[SHOULD_LOOPBACK]], [[ENABLE_LOOPBACK:%.*]] ; CHECK-NEXT: call void @sideeffect0() @@ -1005,8 +1005,8 @@ define void @pr49510() { ; CHECK-NEXT: [[TOBOOL_OLD:%.*]] = icmp ne i16 [[DOTOLD]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_OLD]], label [[LAND_RHS:%.*]], label [[FOR_END:%.*]] ; CHECK: land.rhs: -; CHECK-NEXT: [[DOTBCSSA:%.*]] = phi i16 [ [[DOTOLD]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[LAND_RHS]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[DOTBCSSA]], 0 +; CHECK-NEXT: [[DOTMERGE:%.*]] = phi i16 [ [[TMP0:%.*]], [[LAND_RHS]] ], [ [[DOTOLD]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[DOTMERGE]], 0 ; CHECK-NEXT: [[TMP0]] = load i16, i16* @global_pr49510, align 1 ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i16 [[TMP0]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 [[TOBOOL]], i1 false @@ -1043,15 +1043,15 @@ define i32 @pr51125() { ; CHECK-NEXT: [[ISZERO_OLD:%.*]] = icmp eq i32 [[LD_OLD]], 0 ; CHECK-NEXT: br i1 [[ISZERO_OLD]], label [[EXIT:%.*]], label [[L2:%.*]] ; CHECK: L2: -; CHECK-NEXT: [[LD_BCSSA1:%.*]] = phi i32 [ [[LD_OLD]], [[ENTRY:%.*]] ], [ [[LD:%.*]], [[L2]] 
] +; CHECK-NEXT: [[LD_MERGE:%.*]] = phi i32 [ [[LD:%.*]], [[L2]] ], [ [[LD_OLD]], [[ENTRY:%.*]] ] ; CHECK-NEXT: store i32 -1, i32* @global_pr51125, align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[LD_BCSSA1]], -1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[LD_MERGE]], -1 ; CHECK-NEXT: [[LD]] = load i32, i32* @global_pr51125, align 4 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[LD]], 0 ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP]], i1 true, i1 [[ISZERO]] ; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT]], label [[L2]] ; CHECK: exit: -; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[LD_BCSSA1]], [[L2]] ], [ [[LD_OLD]], [[ENTRY]] ] +; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[LD]], [[L2]] ], [ [[LD_OLD]], [[ENTRY]] ] ; CHECK-NEXT: ret i32 [[R]] ; entry: From f7347dfa03e55ca3837d05160b25a5e9fc1794f7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 16 Aug 2021 08:58:37 +0100 Subject: [PATCH 094/700] Revert "[sanitizer] Define 32bit uptr as uint" This reverts commit 45138f788c9b3c4ac5d9ae4479841c411c15190e. It looks like this breaks building sanitizers on Darwin platforms on Green Dragon https://green.lab.llvm.org/green/job/clang-stage1-RA/23332/console FAILED: lib/sanitizer_common/CMakeFiles/RTSanitizerCommonSymbolizerNoHooks.ios.dir/sanitizer_stacktrace.cpp.o /Users/buildslave/jenkins/workspace/clang-stage1-RA@2/clang-build/./bin/clang++ -DHAVE_RPC_XDR_H=0 -I/Users/buildslave/jenkins/workspace/clang-stage1-RA@2/llvm-project/compiler-rt/lib/sanitizer_common/.. 
-Wall -std=c++14 -Wno-unused-parameter -O2 -g -DNDEBUG -arch armv7 -arch armv7s -arch arm64 -arch armv7k -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.15.sdk -stdlib=libc++ -miphoneos-version-min=9.0 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -fPIC -fno-builtin -fno-exceptions -funwind-tables -fno-stack-protector -fno-sanitize=safe-stack -fvisibility=hidden -fno-lto -Wthread-safety -Wthread-safety-reference -Wthread-safety-beta -O3 -g -Wno-gnu -Wno-variadic-macros -Wno-c99-extensions -Wno-format-pedantic -nostdinc++ -Wno-format -fno-rtti -Wframe-larger-than=570 -Wglobal-constructors -DSANITIZER_SUPPORTS_WEAK_HOOKS=0 -MD -MT lib/sanitizer_common/CMakeFiles/RTSanitizerCommonSymbolizerNoHooks.ios.dir/sanitizer_stacktrace.cpp.o -MF lib/sanitizer_common/CMakeFiles/RTSanitizerCommonSymbolizerNoHooks.ios.dir/sanitizer_stacktrace.cpp.o.d -o lib/sanitizer_common/CMakeFiles/RTSanitizerCommonSymbolizerNoHooks.ios.dir/sanitizer_stacktrace.cpp.o -c '/Users/buildslave/jenkins/workspace/clang-stage1-RA@2/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp' In file included from /Users/buildslave/jenkins/workspace/clang-stage1-RA@2/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp:13: In file included from /Users/buildslave/jenkins/workspace/clang-stage1-RA@2/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h:15: /Users/buildslave/jenkins/workspace/clang-stage1-RA@2/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_common.h:1068:14: error: 'operator new' takes type size_t ('unsigned long') as first parameter inline void *operator new(__sanitizer::operator_new_size_type size, ^ 1 error generated. 
--- compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h | 5 ----- .../lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index de06f42c3f48c..056b00a10e2be 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,13 +139,8 @@ namespace __sanitizer { typedef unsigned long long uptr; typedef signed long long sptr; #else -# if (SANITIZER_WORDSIZE == 64) typedef unsigned long uptr; typedef signed long sptr; -# else -typedef unsigned int uptr; -typedef signed int sptr; -# endif #endif // defined(_WIN64) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp index 385b6158300ca..670e96552c68f 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp @@ -71,7 +71,7 @@ void Print(const set &s) { #if defined(_WIN64) fprintf(stderr, "%llu ", *it); #else - fprintf(stderr, "%zu ", *it); + fprintf(stderr, "%lu ", *it); #endif } fprintf(stderr, "\n"); From c97318996fc1dbc04da4a00f931943a5890b2dc2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 5 Aug 2021 17:18:17 +0200 Subject: [PATCH 095/700] tsan: add new trace Add structures for the new trace format, functions that serialize and add events to the trace and trace replaying logic. 
Differential Revision: https://reviews.llvm.org/D107911 --- compiler-rt/lib/tsan/rtl/tsan_defs.h | 5 + compiler-rt/lib/tsan/rtl/tsan_rtl.cpp | 182 +++++++++++++++ compiler-rt/lib/tsan/rtl/tsan_rtl.h | 93 ++++++++ compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp | 219 ++++++++++++++++++ compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp | 2 + compiler-rt/lib/tsan/rtl/tsan_trace.h | 152 +++++++++++- .../lib/tsan/tests/unit/CMakeLists.txt | 1 + .../lib/tsan/tests/unit/tsan_trace_test.cpp | 215 +++++++++++++++++ 8 files changed, 868 insertions(+), 1 deletion(-) create mode 100644 compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp diff --git a/compiler-rt/lib/tsan/rtl/tsan_defs.h b/compiler-rt/lib/tsan/rtl/tsan_defs.h index 2146a2f40f7a3..fe0c1da31599b 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_defs.h +++ b/compiler-rt/lib/tsan/rtl/tsan_defs.h @@ -51,13 +51,18 @@ typedef __m128i m128; namespace __tsan { +constexpr uptr kByteBits = 8; + // Thread slot ID. enum class Sid : u8 {}; constexpr uptr kThreadSlotCount = 256; +constexpr Sid kFreeSid = static_cast(255); // Abstract time unit, vector clock element. 
enum class Epoch : u16 {}; +constexpr uptr kEpochBits = 14; constexpr Epoch kEpochZero = static_cast(0); +constexpr Epoch kEpochOver = static_cast(1 << kEpochBits); const int kClkBits = 42; const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1; diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp index ece43153d1a4b..6dc0791f53d0a 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp @@ -555,6 +555,188 @@ StackID CurrentStackId(ThreadState *thr, uptr pc) { return id; } +namespace v3 { + +ALWAYS_INLINE USED bool TryTraceMemoryAccess(ThreadState *thr, uptr pc, + uptr addr, uptr size, + AccessType typ) { + DCHECK(size == 1 || size == 2 || size == 4 || size == 8); + if (!kCollectHistory) + return true; + EventAccess *ev; + if (UNLIKELY(!TraceAcquire(thr, &ev))) + return false; + u64 size_log = size == 1 ? 0 : size == 2 ? 1 : size == 4 ? 2 : 3; + uptr pc_delta = pc - thr->trace_prev_pc + (1 << (EventAccess::kPCBits - 1)); + thr->trace_prev_pc = pc; + if (LIKELY(pc_delta < (1 << EventAccess::kPCBits))) { + ev->is_access = 1; + ev->is_read = !!(typ & kAccessRead); + ev->is_atomic = !!(typ & kAccessAtomic); + ev->size_log = size_log; + ev->pc_delta = pc_delta; + DCHECK_EQ(ev->pc_delta, pc_delta); + ev->addr = CompressAddr(addr); + TraceRelease(thr, ev); + return true; + } + auto *evex = reinterpret_cast(ev); + evex->is_access = 0; + evex->is_func = 0; + evex->type = EventType::kAccessExt; + evex->is_read = !!(typ & kAccessRead); + evex->is_atomic = !!(typ & kAccessAtomic); + evex->size_log = size_log; + evex->addr = CompressAddr(addr); + evex->pc = pc; + TraceRelease(thr, evex); + return true; +} + +ALWAYS_INLINE USED bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc, + uptr addr, uptr size, + AccessType typ) { + if (!kCollectHistory) + return true; + EventAccessRange *ev; + if (UNLIKELY(!TraceAcquire(thr, &ev))) + return false; + thr->trace_prev_pc = pc; + ev->is_access = 0; + 
ev->is_func = 0; + ev->type = EventType::kAccessRange; + ev->is_read = !!(typ & kAccessRead); + ev->is_free = !!(typ & kAccessFree); + ev->size_lo = size; + ev->pc = CompressAddr(pc); + ev->addr = CompressAddr(addr); + ev->size_hi = size >> EventAccessRange::kSizeLoBits; + TraceRelease(thr, ev); + return true; +} + +void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ) { + if (LIKELY(TryTraceMemoryAccessRange(thr, pc, addr, size, typ))) + return; + TraceSwitchPart(thr); + UNUSED bool res = TryTraceMemoryAccessRange(thr, pc, addr, size, typ); + DCHECK(res); +} + +void TraceFunc(ThreadState *thr, uptr pc) { + if (LIKELY(TryTraceFunc(thr, pc))) + return; + TraceSwitchPart(thr); + UNUSED bool res = TryTraceFunc(thr, pc); + DCHECK(res); +} + +void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, + StackID stk) { + DCHECK(type == EventType::kLock || type == EventType::kRLock); + if (!kCollectHistory) + return; + EventLock ev; + ev.is_access = 0; + ev.is_func = 0; + ev.type = type; + ev.pc = CompressAddr(pc); + ev.stack_lo = stk; + ev.stack_hi = stk >> EventLock::kStackIDLoBits; + ev._ = 0; + ev.addr = CompressAddr(addr); + TraceEvent(thr, ev); +} + +void TraceMutexUnlock(ThreadState *thr, uptr addr) { + if (!kCollectHistory) + return; + EventUnlock ev; + ev.is_access = 0; + ev.is_func = 0; + ev.type = EventType::kUnlock; + ev._ = 0; + ev.addr = CompressAddr(addr); + TraceEvent(thr, ev); +} + +void TraceTime(ThreadState *thr) { + if (!kCollectHistory) + return; + EventTime ev; + ev.is_access = 0; + ev.is_func = 0; + ev.type = EventType::kTime; + ev.sid = static_cast(thr->sid); + ev.epoch = static_cast(thr->epoch); + ev._ = 0; + TraceEvent(thr, ev); +} + +NOINLINE +void TraceSwitchPart(ThreadState *thr) { + Trace *trace = &thr->tctx->trace; + Event *pos = reinterpret_cast(atomic_load_relaxed(&thr->trace_pos)); + DCHECK_EQ(reinterpret_cast(pos + 1) & TracePart::kAlignment, 0); + auto *part = trace->parts.Back(); + 
DPrintf("TraceSwitchPart part=%p pos=%p\n", part, pos); + if (part) { + // We can get here when we still have space in the current trace part. + // The fast-path check in TraceAcquire has false positives in the middle of + // the part. Check if we are indeed at the end of the current part or not, + // and fill any gaps with NopEvent's. + Event *end = &part->events[TracePart::kSize]; + DCHECK_GE(pos, &part->events[0]); + DCHECK_LE(pos, end); + if (pos + 1 < end) { + if ((reinterpret_cast(pos) & TracePart::kAlignment) == + TracePart::kAlignment) + *pos++ = NopEvent; + *pos++ = NopEvent; + DCHECK_LE(pos + 2, end); + atomic_store_relaxed(&thr->trace_pos, reinterpret_cast(pos)); + // Ensure we setup trace so that the next TraceAcquire + // won't detect trace part end. + Event *ev; + CHECK(TraceAcquire(thr, &ev)); + return; + } + // We are indeed at the end. + for (; pos < end; pos++) *pos = NopEvent; + } +#if !SANITIZER_GO + if (ctx->after_multithreaded_fork) { + // We just need to survive till exec. + CHECK(part); + atomic_store_relaxed(&thr->trace_pos, + reinterpret_cast(&part->events[0])); + return; + } +#endif + part = new (MmapOrDie(sizeof(TracePart), "TracePart")) TracePart(); + part->trace = trace; + thr->trace_prev_pc = 0; + { + Lock lock(&trace->mtx); + trace->parts.PushBack(part); + atomic_store_relaxed(&thr->trace_pos, + reinterpret_cast(&part->events[0])); + } + // Make this part self-sufficient by restoring the current stack + // and mutex set in the beginning of the trace. + TraceTime(thr); + for (uptr *pos = &thr->shadow_stack[0]; pos < thr->shadow_stack_pos; pos++) + CHECK(TryTraceFunc(thr, *pos)); + for (uptr i = 0; i < thr->mset.Size(); i++) { + MutexSet::Desc d = thr->mset.Get(i); + TraceMutexLock(thr, d.write ? 
EventType::kLock : EventType::kRLock, 0, + d.addr, d.stack_id); + } +} + +} // namespace v3 + void TraceSwitch(ThreadState *thr) { #if !SANITIZER_GO if (ctx->after_multithreaded_fork) diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h index 4d05b55ee40e7..1eb9b8c138237 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h @@ -444,6 +444,13 @@ struct ThreadState { const ReportDesc *current_report; + // Current position in tctx->trace.Back()->events (Event*). + atomic_uintptr_t trace_pos; + // PC of the last memory access, used to compute PC deltas in the trace. + uptr trace_prev_pc; + Sid sid; + Epoch epoch; + explicit ThreadState(Context *ctx, Tid tid, int unique_id, u64 epoch, unsigned reuse_count, uptr stk_addr, uptr stk_size, uptr tls_addr, uptr tls_size); @@ -486,6 +493,8 @@ class ThreadContext final : public ThreadContextBase { u64 epoch0; u64 epoch1; + v3::Trace trace; + // Override superclass callbacks. void OnDead() override; void OnJoined(void *arg) override; @@ -549,6 +558,8 @@ struct Context { ClockAlloc clock_alloc; Flags flags; + + Mutex slot_mtx; }; extern Context *ctx; // The one and the only global runtime context. @@ -892,6 +903,88 @@ void LazyInitialize(ThreadState *thr) { #endif } +namespace v3 { + +void TraceSwitchPart(ThreadState *thr); +bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, + uptr size, AccessType typ, VarSizeStackTrace *pstk, + MutexSet *pmset, uptr *ptag); + +template +ALWAYS_INLINE WARN_UNUSED_RESULT bool TraceAcquire(ThreadState *thr, + EventT **ev) { + Event *pos = reinterpret_cast(atomic_load_relaxed(&thr->trace_pos)); +#if SANITIZER_DEBUG + // TraceSwitch acquires these mutexes, + // so we lock them here to detect deadlocks more reliably. 
+ { Lock lock(&ctx->slot_mtx); } + { Lock lock(&thr->tctx->trace.mtx); } + TracePart *current = thr->tctx->trace.parts.Back(); + if (current) { + DCHECK_GE(pos, ¤t->events[0]); + DCHECK_LE(pos, ¤t->events[TracePart::kSize]); + } else { + DCHECK_EQ(pos, nullptr); + } +#endif + // TracePart is allocated with mmap and is at least 4K aligned. + // So the following check is a faster way to check for part end. + // It may have false positives in the middle of the trace, + // they are filtered out in TraceSwitch. + if (UNLIKELY(((uptr)(pos + 1) & TracePart::kAlignment) == 0)) + return false; + *ev = reinterpret_cast(pos); + return true; +} + +template +ALWAYS_INLINE void TraceRelease(ThreadState *thr, EventT *evp) { + DCHECK_LE(evp + 1, &thr->tctx->trace.parts.Back()->events[TracePart::kSize]); + atomic_store_relaxed(&thr->trace_pos, (uptr)(evp + 1)); +} + +template +void TraceEvent(ThreadState *thr, EventT ev) { + EventT *evp; + if (!TraceAcquire(thr, &evp)) { + TraceSwitchPart(thr); + UNUSED bool res = TraceAcquire(thr, &evp); + DCHECK(res); + } + *evp = ev; + TraceRelease(thr, evp); +} + +ALWAYS_INLINE WARN_UNUSED_RESULT bool TryTraceFunc(ThreadState *thr, + uptr pc = 0) { + if (!kCollectHistory) + return true; + EventFunc *ev; + if (UNLIKELY(!TraceAcquire(thr, &ev))) + return false; + ev->is_access = 0; + ev->is_func = 1; + ev->pc = pc; + TraceRelease(thr, ev); + return true; +} + +WARN_UNUSED_RESULT +bool TryTraceMemoryAccess(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); +WARN_UNUSED_RESULT +bool TryTraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); +void TraceMemoryAccessRange(ThreadState *thr, uptr pc, uptr addr, uptr size, + AccessType typ); +void TraceFunc(ThreadState *thr, uptr pc = 0); +void TraceMutexLock(ThreadState *thr, EventType type, uptr pc, uptr addr, + StackID stk); +void TraceMutexUnlock(ThreadState *thr, uptr addr); +void TraceTime(ThreadState *thr); + +} // namespace v3 + } // namespace 
__tsan #endif // TSAN_RTL_H diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp index 06d3fa1326dd5..49e867a63aa92 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp @@ -450,6 +450,225 @@ void RestoreStack(Tid tid, const u64 epoch, VarSizeStackTrace *stk, ExtractTagFromStack(stk, tag); } +namespace v3 { + +// Replays the trace up to last_pos position in the last part +// or up to the provided epoch/sid (whichever is earlier) +// and calls the provided function f for each event. +template +void TraceReplay(Trace *trace, TracePart *last, Event *last_pos, Sid sid, + Epoch epoch, Func f) { + TracePart *part = trace->parts.Front(); + Sid ev_sid = kFreeSid; + Epoch ev_epoch = kEpochOver; + for (;;) { + DCHECK_EQ(part->trace, trace); + // Note: an event can't start in the last element. + // Since an event can take up to 2 elements, + // we ensure we have at least 2 before adding an event. + Event *end = &part->events[TracePart::kSize - 1]; + if (part == last) + end = last_pos; + for (Event *evp = &part->events[0]; evp < end; evp++) { + Event *evp0 = evp; + if (!evp->is_access && !evp->is_func) { + switch (evp->type) { + case EventType::kTime: { + auto *ev = reinterpret_cast(evp); + ev_sid = static_cast(ev->sid); + ev_epoch = static_cast(ev->epoch); + if (ev_sid == sid && ev_epoch > epoch) + return; + break; + } + case EventType::kAccessExt: + FALLTHROUGH; + case EventType::kAccessRange: + FALLTHROUGH; + case EventType::kLock: + FALLTHROUGH; + case EventType::kRLock: + // These take 2 Event elements. + evp++; + break; + case EventType::kUnlock: + // This takes 1 Event element. 
+ break; + } + } + CHECK_NE(ev_sid, kFreeSid); + CHECK_NE(ev_epoch, kEpochOver); + f(ev_sid, ev_epoch, evp0); + } + if (part == last) + return; + part = trace->parts.Next(part); + CHECK(part); + } + CHECK(0); +} + +static void RestoreStackMatch(VarSizeStackTrace *pstk, MutexSet *pmset, + Vector *stack, MutexSet *mset, uptr pc, + bool *found) { + DPrintf2(" MATCHED\n"); + *pmset = *mset; + stack->PushBack(pc); + pstk->Init(&(*stack)[0], stack->Size()); + stack->PopBack(); + *found = true; +} + +// Checks if addr1|size1 is fully contained in addr2|size2. +// We check for fully contained instread of just overlapping +// because a memory access is always traced once, but can be +// split into multiple accesses in the shadow. +static constexpr bool IsWithinAccess(uptr addr1, uptr size1, uptr addr2, + uptr size2) { + return addr1 >= addr2 && addr1 + size1 <= addr2 + size2; +} + +// Replays the trace of thread tid up to the target event identified +// by sid/epoch/addr/size/typ and restores and returns stack, mutex set +// and tag for that event. If there are multiple such events, it returns +// the last one. Returns false if the event is not present in the trace. +bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, + uptr size, AccessType typ, VarSizeStackTrace *pstk, + MutexSet *pmset, uptr *ptag) { + // This function restores stack trace and mutex set for the thread/epoch. + // It does so by getting stack trace and mutex set at the beginning of + // trace part, and then replaying the trace till the given epoch. + DPrintf2("RestoreStack: tid=%u sid=%u@%u addr=0x%zx/%zu typ=%x\n", tid, sid, + epoch, addr, size, typ); + ctx->slot_mtx.CheckLocked(); // needed to prevent trace part recycling + ctx->thread_registry.CheckLocked(); + ThreadContext *tctx = + static_cast(ctx->thread_registry.GetThreadLocked(tid)); + Trace *trace = &tctx->trace; + // Snapshot first/last parts and the current position in the last part. 
+ TracePart *first_part; + TracePart *last_part; + Event *last_pos; + { + Lock lock(&trace->mtx); + first_part = trace->parts.Front(); + if (!first_part) + return false; + last_part = trace->parts.Back(); + last_pos = trace->final_pos; + if (tctx->thr) + last_pos = (Event *)atomic_load_relaxed(&tctx->thr->trace_pos); + } + // Too large for stack. + alignas(MutexSet) static char mset_storage[sizeof(MutexSet)]; + MutexSet &mset = *new (mset_storage) MutexSet(); + Vector stack; + uptr prev_pc = 0; + bool found = false; + bool is_read = typ & kAccessRead; + bool is_atomic = typ & kAccessAtomic; + bool is_free = typ & kAccessFree; + TraceReplay( + trace, last_part, last_pos, sid, epoch, + [&](Sid ev_sid, Epoch ev_epoch, Event *evp) { + bool match = ev_sid == sid && ev_epoch == epoch; + if (evp->is_access) { + if (evp->is_func == 0 && evp->type == EventType::kAccessExt && + evp->_ == 0) // NopEvent + return; + auto *ev = reinterpret_cast(evp); + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_size = 1 << ev->size_log; + uptr ev_pc = + prev_pc + ev->pc_delta - (1 << (EventAccess::kPCBits - 1)); + prev_pc = ev_pc; + DPrintf2(" Access: pc=0x%zx addr=0x%llx/%llu type=%llu/%llu\n", + ev_pc, ev_addr, ev_size, ev->is_read, ev->is_atomic); + if (match && type == EventType::kAccessExt && + IsWithinAccess(addr, size, ev_addr, ev_size) && + is_read == ev->is_read && is_atomic == ev->is_atomic && !is_free) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev_pc, &found); + return; + } + if (evp->is_func) { + auto *ev = reinterpret_cast(evp); + if (ev->pc) { + DPrintf2(" FuncEnter: pc=0x%zx\n", ev->pc); + stack.PushBack(ev->pc); + } else { + DPrintf2(" FuncExit\n"); + CHECK(stack.Size()); + stack.PopBack(); + } + return; + } + switch (evp->type) { + case EventType::kAccessExt: { + auto *ev = reinterpret_cast(evp); + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_size = 1 << ev->size_log; + prev_pc = ev->pc; + DPrintf2(" AccessExt: pc=0x%zx addr=0x%llx/%llu type=%llu/%llu\n", + ev->pc, 
ev_addr, ev_size, ev->is_read, ev->is_atomic); + if (match && type == EventType::kAccessExt && + IsWithinAccess(addr, size, ev_addr, ev_size) && + is_read == ev->is_read && is_atomic == ev->is_atomic && + !is_free) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev->pc, &found); + break; + } + case EventType::kAccessRange: { + auto *ev = reinterpret_cast(evp); + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_size = + (ev->size_hi << EventAccessRange::kSizeLoBits) + ev->size_lo; + uptr ev_pc = RestoreAddr(ev->pc); + prev_pc = ev_pc; + DPrintf2(" Range: pc=0x%zx addr=0x%llx/%llu type=%llu/%llu\n", + ev_pc, ev_addr, ev_size, ev->is_read, ev->is_free); + if (match && type == EventType::kAccessExt && + IsWithinAccess(addr, size, ev_addr, ev_size) && + is_read == ev->is_read && !is_atomic && is_free == ev->is_free) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev_pc, &found); + break; + } + case EventType::kLock: + FALLTHROUGH; + case EventType::kRLock: { + auto *ev = reinterpret_cast(evp); + bool is_write = ev->type == EventType::kLock; + uptr ev_addr = RestoreAddr(ev->addr); + uptr ev_pc = RestoreAddr(ev->pc); + StackID stack_id = + (ev->stack_hi << EventLock::kStackIDLoBits) + ev->stack_lo; + DPrintf2(" Lock: pc=0x%zx addr=0x%llx stack=%u write=%d\n", ev_pc, + ev_addr, stack_id, is_write); + mset.AddAddr(ev_addr, stack_id, is_write); + // Events with ev_pc == 0 are written to the beginning of trace + // part as initial mutex set (are not real). + if (match && type == EventType::kLock && addr == ev_addr && ev_pc) + RestoreStackMatch(pstk, pmset, &stack, &mset, ev_pc, &found); + break; + } + case EventType::kUnlock: { + auto *ev = reinterpret_cast(evp); + uptr ev_addr = RestoreAddr(ev->addr); + DPrintf2(" Unlock: addr=0x%llx\n", ev_addr); + mset.DelAddr(ev_addr); + break; + } + case EventType::kTime: + // TraceReplay already extracted sid/epoch from it, + // nothing else to do here. 
+ break; + } + }); + ExtractTagFromStack(pstk, ptag); + return found; +} + +} // namespace v3 + static bool FindRacyStacks(const RacyStacks &hash) { for (uptr i = 0; i < ctx->racy_stacks.Size(); i++) { if (hash == ctx->racy_stacks[i]) { diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp index aefbda2f83720..32261f5ee685b 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_thread.cpp @@ -252,6 +252,8 @@ void ThreadStart(ThreadState *thr, Tid tid, tid_t os_id, thr->tctx = (ThreadContext*)tr->GetThreadLocked(tid); tr->Unlock(); + while (!thr->tctx->trace.parts.Empty()) thr->tctx->trace.parts.PopBack(); + #if !SANITIZER_GO if (ctx->after_multithreaded_fork) { thr->ignore_interceptors++; diff --git a/compiler-rt/lib/tsan/rtl/tsan_trace.h b/compiler-rt/lib/tsan/rtl/tsan_trace.h index f5e0c407cda86..a771ad9f52fd3 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_trace.h +++ b/compiler-rt/lib/tsan/rtl/tsan_trace.h @@ -13,8 +13,9 @@ #define TSAN_TRACE_H #include "tsan_defs.h" -#include "tsan_stack_trace.h" +#include "tsan_ilist.h" #include "tsan_mutexset.h" +#include "tsan_stack_trace.h" namespace __tsan { @@ -67,6 +68,155 @@ struct Trace { Trace() : mtx(MutexTypeTrace) {} }; +namespace v3 { + +enum class EventType : u64 { + kAccessExt, + kAccessRange, + kLock, + kRLock, + kUnlock, + kTime, +}; + +// "Base" type for all events for type dispatch. +struct Event { + // We use variable-length type encoding to give more bits to some event + // types that need them. If is_access is set, this is EventAccess. + // Otherwise, if is_func is set, this is EventFunc. + // Otherwise type denotes the type. + u64 is_access : 1; + u64 is_func : 1; + EventType type : 3; + u64 _ : 59; +}; +static_assert(sizeof(Event) == 8, "bad Event size"); + +// Nop event used as padding and does not affect state during replay. 
+static constexpr Event NopEvent = {1, 0, EventType::kAccessExt, 0}; + +// Compressed memory access can represent only some events with PCs +// close enough to each other. Otherwise we fall back to EventAccessExt. +struct EventAccess { + static constexpr uptr kPCBits = 15; + + u64 is_access : 1; // = 1 + u64 is_read : 1; + u64 is_atomic : 1; + u64 size_log : 2; + u64 pc_delta : kPCBits; // signed delta from the previous memory access PC + u64 addr : kCompressedAddrBits; +}; +static_assert(sizeof(EventAccess) == 8, "bad EventAccess size"); + +// Function entry (pc != 0) or exit (pc == 0). +struct EventFunc { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 1 + u64 pc : 62; +}; +static_assert(sizeof(EventFunc) == 8, "bad EventFunc size"); + +// Extended memory access with full PC. +struct EventAccessExt { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kAccessExt + u64 is_read : 1; + u64 is_atomic : 1; + u64 size_log : 2; + u64 _ : 11; + u64 addr : kCompressedAddrBits; + u64 pc; +}; +static_assert(sizeof(EventAccessExt) == 16, "bad EventAccessExt size"); + +// Access to a memory range. +struct EventAccessRange { + static constexpr uptr kSizeLoBits = 13; + + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kAccessRange + u64 is_read : 1; + u64 is_free : 1; + u64 size_lo : kSizeLoBits; + u64 pc : kCompressedAddrBits; + u64 addr : kCompressedAddrBits; + u64 size_hi : 64 - kCompressedAddrBits; +}; +static_assert(sizeof(EventAccessRange) == 16, "bad EventAccessRange size"); + +// Mutex lock. 
+struct EventLock { + static constexpr uptr kStackIDLoBits = 15; + + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kLock or EventType::kRLock + u64 pc : kCompressedAddrBits; + u64 stack_lo : kStackIDLoBits; + u64 stack_hi : sizeof(StackID) * kByteBits - kStackIDLoBits; + u64 _ : 3; + u64 addr : kCompressedAddrBits; +}; +static_assert(sizeof(EventLock) == 16, "bad EventLock size"); + +// Mutex unlock. +struct EventUnlock { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kUnlock + u64 _ : 15; + u64 addr : kCompressedAddrBits; +}; +static_assert(sizeof(EventUnlock) == 8, "bad EventUnlock size"); + +// Time change event. +struct EventTime { + u64 is_access : 1; // = 0 + u64 is_func : 1; // = 0 + EventType type : 3; // = EventType::kTime + u64 sid : sizeof(Sid) * kByteBits; + u64 epoch : kEpochBits; + u64 _ : 64 - 5 - sizeof(Sid) * kByteBits - kEpochBits; +}; +static_assert(sizeof(EventTime) == 8, "bad EventTime size"); + +struct Trace; + +struct TraceHeader { + Trace* trace = nullptr; // back-pointer to Trace containing this part + INode trace_parts; // in Trace::parts +}; + +struct TracePart : TraceHeader { + static constexpr uptr kByteSize = 256 << 10; + static constexpr uptr kSize = + (kByteSize - sizeof(TraceHeader)) / sizeof(Event); + // TraceAcquire does a fast event pointer overflow check by comparing + // pointer into TracePart::events with kAlignment mask. Since TracePart's + // are allocated page-aligned, this check detects end of the array + // (it also have false positives in the middle that are filtered separately). + // This also requires events to be the last field. 
+ static constexpr uptr kAlignment = 0xff0; + Event events[kSize]; + + TracePart() {} +}; +static_assert(sizeof(TracePart) == TracePart::kByteSize, "bad TracePart size"); + +struct Trace { + Mutex mtx; + IList parts; + Event* final_pos = + nullptr; // final position in the last part for finished threads + + Trace() : mtx(MutexTypeTrace) {} +}; + +} // namespace v3 + } // namespace __tsan #endif // TSAN_TRACE_H diff --git a/compiler-rt/lib/tsan/tests/unit/CMakeLists.txt b/compiler-rt/lib/tsan/tests/unit/CMakeLists.txt index 576aeda9ab0a7..ed614a26955e6 100644 --- a/compiler-rt/lib/tsan/tests/unit/CMakeLists.txt +++ b/compiler-rt/lib/tsan/tests/unit/CMakeLists.txt @@ -7,6 +7,7 @@ set(TSAN_UNIT_TEST_SOURCES tsan_shadow_test.cpp tsan_stack_test.cpp tsan_sync_test.cpp + tsan_trace_test.cpp tsan_unit_test_main.cpp tsan_vector_clock_test.cpp ) diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp new file mode 100644 index 0000000000000..6e598323345ea --- /dev/null +++ b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp @@ -0,0 +1,215 @@ +//===-- tsan_trace_test.cpp -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. +// +//===----------------------------------------------------------------------===// +#include "tsan_trace.h" + +#include + +#include "gtest/gtest.h" +#include "tsan_rtl.h" + +namespace __tsan { + +using namespace v3; + +// We need to run all trace tests in a new thread, +// so that the thread trace is empty initially. 
+static void run_in_thread(void *(*f)(void *), void *arg = nullptr) { + pthread_t th; + pthread_create(&th, nullptr, f, arg); + pthread_join(th, nullptr); +} + +TEST(Trace, RestoreAccess) { + struct Thread { + static void *Func(void *arg) { + // A basic test with some function entry/exit events, + // some mutex lock/unlock events and some other distracting + // memory events. + ThreadState *thr = cur_thread(); + TraceFunc(thr, 0x1000); + TraceFunc(thr, 0x1001); + TraceMutexLock(thr, v3::EventType::kLock, 0x4000, 0x5000, 0x6000); + TraceMutexLock(thr, v3::EventType::kLock, 0x4001, 0x5001, 0x6001); + TraceMutexUnlock(thr, 0x5000); + TraceFunc(thr); + CHECK(TryTraceMemoryAccess(thr, 0x2001, 0x3001, 8, kAccessRead)); + TraceMutexLock(thr, v3::EventType::kRLock, 0x4002, 0x5002, 0x6002); + TraceFunc(thr, 0x1002); + CHECK(TryTraceMemoryAccess(thr, 0x2000, 0x3000, 8, kAccessRead)); + // This is the access we want to find. + // The previous one is equivalent, but RestoreStack must prefer + // the last of the matchig accesses. 
+ CHECK(TryTraceMemoryAccess(thr, 0x2002, 0x3000, 8, kAccessRead)); + Lock lock1(&ctx->slot_mtx); + ThreadRegistryLock lock2(&ctx->thread_registry); + VarSizeStackTrace stk; + MutexSet mset; + uptr tag = kExternalTagNone; + bool res = + RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid, + thr->epoch, 0x3000, 8, kAccessRead, &stk, &mset, &tag); + CHECK(res); + CHECK_EQ(stk.size, 3); + CHECK_EQ(stk.trace[0], 0x1000); + CHECK_EQ(stk.trace[1], 0x1002); + CHECK_EQ(stk.trace[2], 0x2002); + CHECK_EQ(mset.Size(), 2); + CHECK_EQ(mset.Get(0).addr, 0x5001); + CHECK_EQ(mset.Get(0).stack_id, 0x6001); + CHECK_EQ(mset.Get(0).write, true); + CHECK_EQ(mset.Get(1).addr, 0x5002); + CHECK_EQ(mset.Get(1).stack_id, 0x6002); + CHECK_EQ(mset.Get(1).write, false); + CHECK_EQ(tag, kExternalTagNone); + return nullptr; + } + }; + run_in_thread(Thread::Func); +} + +TEST(Trace, MemoryAccessSize) { + struct Thread { + struct Params { + uptr access_size, offset, size; + bool res; + int type; + }; + static void *Func(void *arg) { + // Test tracing and matching of accesses of different sizes. + const Params *params = static_cast(arg); + Printf("access_size=%zu, offset=%zu, size=%zu, res=%d, type=%d\n", + params->access_size, params->offset, params->size, params->res, + params->type); + ThreadState *thr = cur_thread(); + TraceFunc(thr, 0x1000); + switch (params->type) { + case 0: + // This should emit compressed event. + CHECK(TryTraceMemoryAccess(thr, 0x2000, 0x3000, params->access_size, + kAccessRead)); + break; + case 1: + // This should emit full event. 
+ CHECK(TryTraceMemoryAccess(thr, 0x2000000, 0x3000, + params->access_size, kAccessRead)); + break; + case 2: + TraceMemoryAccessRange(thr, 0x2000000, 0x3000, params->access_size, + kAccessRead); + break; + } + Lock lock1(&ctx->slot_mtx); + ThreadRegistryLock lock2(&ctx->thread_registry); + VarSizeStackTrace stk; + MutexSet mset; + uptr tag = kExternalTagNone; + bool res = RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid, + thr->epoch, 0x3000 + params->offset, params->size, + kAccessRead, &stk, &mset, &tag); + CHECK_EQ(res, params->res); + if (params->res) { + CHECK_EQ(stk.size, 2); + CHECK_EQ(stk.trace[0], 0x1000); + CHECK_EQ(stk.trace[1], params->type ? 0x2000000 : 0x2000); + } + return nullptr; + } + }; + Thread::Params tests[] = { + {1, 0, 1, true}, {4, 0, 2, true}, + {4, 2, 2, true}, {8, 3, 1, true}, + {2, 1, 1, true}, {1, 1, 1, false}, + {8, 5, 4, false}, {4, static_cast(-1l), 4, false}, + }; + for (auto params : tests) { + for (params.type = 0; params.type < 3; params.type++) + run_in_thread(Thread::Func, ¶ms); + } +} + +TEST(Trace, RestoreMutexLock) { + struct Thread { + static void *Func(void *arg) { + // Check of restoration of a mutex lock event. 
+ ThreadState *thr = cur_thread(); + TraceFunc(thr, 0x1000); + TraceMutexLock(thr, v3::EventType::kLock, 0x4000, 0x5000, 0x6000); + TraceMutexLock(thr, v3::EventType::kRLock, 0x4001, 0x5001, 0x6001); + TraceMutexLock(thr, v3::EventType::kRLock, 0x4002, 0x5001, 0x6002); + Lock lock1(&ctx->slot_mtx); + ThreadRegistryLock lock2(&ctx->thread_registry); + VarSizeStackTrace stk; + MutexSet mset; + uptr tag = kExternalTagNone; + bool res = RestoreStack(thr->tid, v3::EventType::kLock, thr->sid, + thr->epoch, 0x5001, 0, 0, &stk, &mset, &tag); + CHECK_EQ(stk.size, 2); + CHECK_EQ(stk.trace[0], 0x1000); + CHECK_EQ(stk.trace[1], 0x4002); + CHECK_EQ(mset.Size(), 2); + CHECK_EQ(mset.Get(0).addr, 0x5000); + CHECK_EQ(mset.Get(0).stack_id, 0x6000); + CHECK_EQ(mset.Get(0).write, true); + CHECK_EQ(mset.Get(1).addr, 0x5001); + CHECK_EQ(mset.Get(1).stack_id, 0x6001); + CHECK_EQ(mset.Get(1).write, false); + return nullptr; + } + }; + run_in_thread(Thread::Func); +} + +TEST(Trace, MultiPart) { + struct Thread { + static void *Func(void *arg) { + // Check replay of a trace with multiple parts. 
+ ThreadState *thr = cur_thread(); + TraceFunc(thr, 0x1000); + TraceFunc(thr, 0x2000); + TraceMutexLock(thr, v3::EventType::kLock, 0x4000, 0x5000, 0x6000); + const uptr kEvents = 3 * sizeof(TracePart) / sizeof(v3::Event); + for (uptr i = 0; i < kEvents; i++) { + TraceFunc(thr, 0x3000); + TraceMutexLock(thr, v3::EventType::kLock, 0x4002, 0x5002, 0x6002); + TraceMutexUnlock(thr, 0x5002); + TraceFunc(thr); + } + TraceFunc(thr, 0x4000); + TraceMutexLock(thr, v3::EventType::kRLock, 0x4001, 0x5001, 0x6001); + CHECK(TryTraceMemoryAccess(thr, 0x2002, 0x3000, 8, kAccessRead)); + Lock lock1(&ctx->slot_mtx); + ThreadRegistryLock lock2(&ctx->thread_registry); + VarSizeStackTrace stk; + MutexSet mset; + uptr tag = kExternalTagNone; + bool res = + RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid, + thr->epoch, 0x3000, 8, kAccessRead, &stk, &mset, &tag); + CHECK_EQ(stk.size, 4); + CHECK_EQ(stk.trace[0], 0x1000); + CHECK_EQ(stk.trace[1], 0x2000); + CHECK_EQ(stk.trace[2], 0x4000); + CHECK_EQ(stk.trace[3], 0x2002); + CHECK_EQ(mset.Size(), 2); + CHECK_EQ(mset.Get(0).addr, 0x5000); + CHECK_EQ(mset.Get(0).stack_id, 0x6000); + CHECK_EQ(mset.Get(0).write, true); + CHECK_EQ(mset.Get(1).addr, 0x5001); + CHECK_EQ(mset.Get(1).stack_id, 0x6001); + CHECK_EQ(mset.Get(1).write, false); + return nullptr; + } + }; + run_in_thread(Thread::Func); +} + +} // namespace __tsan From 96d5a501c5d9dac2190cd3f6c812860903021982 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 13 Aug 2021 10:39:03 +0100 Subject: [PATCH 096/700] [LoopUnroll] Add peeling tests with unreachable exits. 
--- .../peel-multiple-unreachable-exits.ll | 165 ++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll diff --git a/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll b/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll new file mode 100644 index 0000000000000..0a562900405cd --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-unroll -S %s | FileCheck %s + +declare void @foo() + +define void @unroll_unreachable_exit_and_latch_exit(i32* %ptr, i32 %N, i32 %x) { +; CHECK-LABEL: @unroll_unreachable_exit_and_latch_exit( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], 2 +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i32 [[IV]], [[X:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[UNREACHABLE_EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[M:%.*]] = phi i32 [ 0, [[THEN]] ], [ [[X]], [[ELSE]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[M]], i32* [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[C_3:%.*]] = icmp ult i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_HEADER]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: unreachable.exit: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: unreachable +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ] + %c = icmp ult i32 %iv, 2 + br i1 %c, label %then, label 
%else + +then: + br label %loop.latch + +else: + %c.2 = icmp eq i32 %iv, %x + br i1 %c.2, label %unreachable.exit, label %loop.latch + +loop.latch: + %m = phi i32 [ 0, %then ], [ %x, %else ] + %gep = getelementptr i32, i32* %ptr, i32 %iv + store i32 %m, i32* %gep + %iv.next = add nuw nsw i32 %iv, 1 + %c.3 = icmp ult i32 %iv, 1000 + br i1 %c.3, label %loop.header, label %exit + +exit: + ret void + +unreachable.exit: + call void @foo() + unreachable +} + +define void @unroll_unreachable_exit_and_header_exit(i32* %ptr, i32 %N, i32 %x) { +; CHECK-LABEL: @unroll_unreachable_exit_and_header_exit( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[ELSE:%.*]] +; CHECK: else: +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i32 1, [[X:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[UNREACHABLE_EXIT:%.*]], label [[LOOP_LATCH:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 1 +; CHECK-NEXT: store i32 [[X]], i32* [[GEP]], align 4 +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: unreachable.exit: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: unreachable +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ] + %c = icmp ult i32 %iv, 1000 + br i1 %c, label %exit, label %else + +else: + %c.2 = icmp eq i32 %iv, %x + br i1 %c.2, label %unreachable.exit, label %loop.latch + +loop.latch: + %gep = getelementptr i32, i32* %ptr, i32 %iv + store i32 %x, i32* %gep + %iv.next = add nuw nsw i32 %iv, 1 + br label %loop.header + +exit: + ret void + +unreachable.exit: + call void @foo() + unreachable +} + +define void @unroll_unreachable_and_multiple_reachable_exits(i32* %ptr, i32 %N, i32 %x) { +; CHECK-LABEL: @unroll_unreachable_and_multiple_reachable_exits( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, 
[[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], 2 +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i32 [[IV]], [[X:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 [[IV]], [[X]] +; CHECK-NEXT: br i1 [[C_3]], label [[UNREACHABLE_EXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[M:%.*]] = phi i32 [ 0, [[THEN]] ], [ [[X]], [[ELSE]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[M]], i32* [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[C_4:%.*]] = icmp ult i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[C_4]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: unreachable.exit: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: unreachable +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ] + %c = icmp ult i32 %iv, 2 + br i1 %c, label %then, label %else + +then: + %c.2 = icmp sgt i32 %iv, %x + br i1 %c.2, label %exit, label %loop.latch + +else: + %c.3 = icmp eq i32 %iv, %x + br i1 %c.3, label %unreachable.exit, label %loop.latch + +loop.latch: + %m = phi i32 [ 0, %then ], [ %x, %else ] + %gep = getelementptr i32, i32* %ptr, i32 %iv + store i32 %m, i32* %gep + %iv.next = add nuw nsw i32 %iv, 1 + %c.4 = icmp ult i32 %iv, 1000 + br i1 %c.4, label %loop.header, label %exit + +exit: + ret void + +unreachable.exit: + call void @foo() + unreachable +} From 39cc0b8c68b8d316954ecfac0d1f8498ea42866c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 13 Aug 2021 11:23:58 +0100 Subject: [PATCH 097/700] [PhaseOrdering] Add test for missed vectorization with vector::at calls. 
This test illustrates missed vectorization of loops with multiple std::vector::at calls, like int sum(std::vector *A, std::vector *B, int N) { int cost = 0; for (int i = 0; i < N; ++i) cost += A->at(i) + B->at(i); return cost; } https://clang.godbolt.org/z/KbYoaPhvq --- ...ple-unreachable-exits-for-vectorization.ll | 196 ++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll new file mode 100644 index 0000000000000..c24affbeaad12 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -O2 -mtriple=arm64-apple-ios -S %s | FileCheck %s + +%vec = type { i64*, i64* } + +; Test to ensure a loop with multiple loads guarded by runtime-checks (like +; from multiple calls to C++'s std::vector::at) can be vectorized after +; hoisting the runtime checks out of the loop. 
+ +define i64 @sum_2_at_with_int_conversion(%vec* %A, %vec* %B, i64 %N) { +; CHECK-LABEL: @sum_2_at_with_int_conversion( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP_START_I:%.*]] = getelementptr [[VEC:%.*]], %vec* [[A:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[START_I:%.*]] = load i64*, i64** [[GEP_START_I]], align 8 +; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr [[VEC]], %vec* [[A]], i64 0, i32 1 +; CHECK-NEXT: [[END_I:%.*]] = load i64*, i64** [[GEP_END_I]], align 8 +; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint i64* [[START_I]] to i64 +; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint i64* [[END_I]] to i64 +; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[GEP_START_I1:%.*]] = getelementptr [[VEC]], %vec* [[B:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[GEP_END_I3:%.*]] = getelementptr [[VEC]], %vec* [[B]], i64 0, i32 1 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12:%.*]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT12]] ] +; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] +; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: at_with_int_conversion.exit: +; CHECK-NEXT: [[START_I2:%.*]] = load i64*, i64** [[GEP_START_I1]], align 8 +; CHECK-NEXT: [[END_I4:%.*]] = load i64*, i64** [[GEP_END_I3]], align 8 +; CHECK-NEXT: [[START_INT_I5:%.*]] = ptrtoint i64* [[START_I2]] to i64 +; CHECK-NEXT: [[END_INT_I6:%.*]] = ptrtoint i64* [[END_I4]] to i64 +; CHECK-NEXT: [[SUB_I7:%.*]] = sub i64 [[END_INT_I6]], [[START_INT_I5]] +; CHECK-NEXT: [[INRANGE_I8:%.*]] = icmp ult i64 [[SUB_I7]], [[IV]] +; CHECK-NEXT: br i1 [[INRANGE_I8]], label [[ERROR_I11:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT12]] +; CHECK: error.i11: +; CHECK-NEXT: tail call 
void @error() +; CHECK-NEXT: unreachable +; CHECK: at_with_int_conversion.exit12: +; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[IV]] +; CHECK-NEXT: [[LV_I:%.*]] = load i64, i64* [[GEP_IDX_I]], align 4 +; CHECK-NEXT: [[GEP_IDX_I9:%.*]] = getelementptr i64, i64* [[START_I2]], i64 [[IV]] +; CHECK-NEXT: [[LV_I10:%.*]] = load i64, i64* [[GEP_IDX_I9]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]] +; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I10]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], [[N:%.*]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret i64 [[SUM_NEXT]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %sum = phi i64 [ 0, %entry ], [ %sum.next, %loop ] + %a = call i64 @at_with_int_conversion(%vec* %A, i64 %iv) + %b = call i64 @at_with_int_conversion(%vec* %B, i64 %iv) + %add = add i64 %a, %b + %sum.next = add i64 %sum, %add + %iv.next = add nuw nsw i64 %iv, 1 + %c = icmp slt i64 %iv, %N + br i1 %c, label %loop, label %exit + +exit: + ret i64 %sum.next +} + +define i64 @sum_3_at_with_int_conversion(%vec* %A, %vec* %B, %vec* %C, i64 %N) { +; CHECK-LABEL: @sum_3_at_with_int_conversion( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GEP_START_I:%.*]] = getelementptr [[VEC:%.*]], %vec* [[A:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[START_I:%.*]] = load i64*, i64** [[GEP_START_I]], align 8 +; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr [[VEC]], %vec* [[A]], i64 0, i32 1 +; CHECK-NEXT: [[END_I:%.*]] = load i64*, i64** [[GEP_END_I]], align 8 +; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint i64* [[START_I]] to i64 +; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint i64* [[END_I]] to i64 +; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[GEP_START_I1:%.*]] = getelementptr [[VEC]], %vec* [[B:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[GEP_END_I3:%.*]] = 
getelementptr [[VEC]], %vec* [[B]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_START_I13:%.*]] = getelementptr [[VEC]], %vec* [[C:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[GEP_END_I15:%.*]] = getelementptr [[VEC]], %vec* [[C]], i64 0, i32 1 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24:%.*]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT24]] ] +; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] +; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: at_with_int_conversion.exit: +; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, i64* [[START_I]], i64 [[IV]] +; CHECK-NEXT: [[LV_I:%.*]] = load i64, i64* [[GEP_IDX_I]], align 4 +; CHECK-NEXT: [[START_I2:%.*]] = load i64*, i64** [[GEP_START_I1]], align 8 +; CHECK-NEXT: [[END_I4:%.*]] = load i64*, i64** [[GEP_END_I3]], align 8 +; CHECK-NEXT: [[START_INT_I5:%.*]] = ptrtoint i64* [[START_I2]] to i64 +; CHECK-NEXT: [[END_INT_I6:%.*]] = ptrtoint i64* [[END_I4]] to i64 +; CHECK-NEXT: [[SUB_I7:%.*]] = sub i64 [[END_INT_I6]], [[START_INT_I5]] +; CHECK-NEXT: [[INRANGE_I8:%.*]] = icmp ult i64 [[SUB_I7]], [[IV]] +; CHECK-NEXT: br i1 [[INRANGE_I8]], label [[ERROR_I11:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT12:%.*]] +; CHECK: error.i11: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: at_with_int_conversion.exit12: +; CHECK-NEXT: [[START_I14:%.*]] = load i64*, i64** [[GEP_START_I13]], align 8 +; CHECK-NEXT: [[END_I16:%.*]] = load i64*, i64** [[GEP_END_I15]], align 8 +; CHECK-NEXT: [[START_INT_I17:%.*]] = ptrtoint i64* [[START_I14]] to i64 +; CHECK-NEXT: [[END_INT_I18:%.*]] = ptrtoint i64* [[END_I16]] to i64 +; CHECK-NEXT: [[SUB_I19:%.*]] = sub i64 [[END_INT_I18]], [[START_INT_I17]] +; 
CHECK-NEXT: [[INRANGE_I20:%.*]] = icmp ult i64 [[SUB_I19]], [[IV]] +; CHECK-NEXT: br i1 [[INRANGE_I20]], label [[ERROR_I23:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT24]] +; CHECK: error.i23: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: at_with_int_conversion.exit24: +; CHECK-NEXT: [[GEP_IDX_I9:%.*]] = getelementptr i64, i64* [[START_I2]], i64 [[IV]] +; CHECK-NEXT: [[LV_I10:%.*]] = load i64, i64* [[GEP_IDX_I9]], align 4 +; CHECK-NEXT: [[GEP_IDX_I21:%.*]] = getelementptr i64, i64* [[START_I14]], i64 [[IV]] +; CHECK-NEXT: [[LV_I22:%.*]] = load i64, i64* [[GEP_IDX_I21]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I10]] +; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I22]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[IV]], [[N:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret i64 [[SUM_NEXT]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %sum = phi i64 [ 0, %entry ], [ %sum.next, %loop ] + %a = call i64 @at_with_int_conversion(%vec* %A, i64 %iv) + %b = call i64 @at_with_int_conversion(%vec* %B, i64 %iv) + %c = call i64 @at_with_int_conversion(%vec* %C, i64 %iv) + %add.1 = add i64 %a, %b + %add.2 = add i64 %add.1, %c + %sum.next = add i64 %sum, %add.2 + %iv.next = add nuw nsw i64 %iv, 1 + %cond = icmp slt i64 %iv, %N + br i1 %cond, label %loop, label %exit + +exit: + ret i64 %sum.next +} + + +define i64 @at_with_int_conversion(%vec* %ptr, i64 %idx) { +; CHECK-LABEL: @at_with_int_conversion( +; CHECK-NEXT: [[GEP_START:%.*]] = getelementptr [[VEC:%.*]], %vec* [[PTR:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[START:%.*]] = load i64*, i64** [[GEP_START]], align 8 +; CHECK-NEXT: [[GEP_END:%.*]] = getelementptr [[VEC]], %vec* [[PTR]], i64 0, i32 1 +; CHECK-NEXT: [[END:%.*]] = load i64*, i64** [[GEP_END]], align 8 +; 
CHECK-NEXT: [[START_INT:%.*]] = ptrtoint i64* [[START]] to i64 +; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint i64* [[END]] to i64 +; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[END_INT]], [[START_INT]] +; CHECK-NEXT: [[INRANGE:%.*]] = icmp ult i64 [[SUB]], [[IDX:%.*]] +; CHECK-NEXT: br i1 [[INRANGE]], label [[ERROR:%.*]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[GEP_IDX:%.*]] = getelementptr i64, i64* [[START]], i64 [[IDX]] +; CHECK-NEXT: [[LV:%.*]] = load i64, i64* [[GEP_IDX]], align 4 +; CHECK-NEXT: ret i64 [[LV]] +; CHECK: error: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; + %gep.start = getelementptr %vec, %vec* %ptr, i64 0, i32 0 + %start = load i64*, i64** %gep.start + %gep.end = getelementptr %vec, %vec* %ptr, i64 0, i32 1 + %end = load i64*, i64** %gep.end + %start.int = ptrtoint i64* %start to i64 + %end.int = ptrtoint i64* %end to i64 + %sub = sub i64 %end.int, %start.int + %inrange = icmp ugt i64 %idx, %sub + br i1 %inrange, label %error, label %exit + +exit: + %gep.idx = getelementptr i64, i64* %start, i64 %idx + %lv = load i64, i64* %gep.idx + ret i64 %lv + +error: + call void @error() + unreachable +} + +declare void @error() + + From 9b19b778835f71dc65b442cb1d86463d78797f19 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 16 Aug 2021 09:21:39 +0100 Subject: [PATCH 098/700] [NFC] Remove unused code in llvm::createSimpleTargetReduction --- llvm/lib/Transforms/Utils/LoopUtils.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 51498548856d6..49dbb0439d31c 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -959,11 +959,6 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, Value *Src, RecurKind RdxKind, ArrayRef RedOps) { - TargetTransformInfo::ReductionFlags RdxFlags; - RdxFlags.IsMaxOp = RdxKind == RecurKind::SMax || RdxKind == 
RecurKind::UMax || - RdxKind == RecurKind::FMax; - RdxFlags.IsSigned = RdxKind == RecurKind::SMax || RdxKind == RecurKind::SMin; - auto *SrcVecEltTy = cast(Src->getType())->getElementType(); switch (RdxKind) { case RecurKind::Add: From 7313a6d87c04c33f0bc67297241e33f2d82a0d5d Mon Sep 17 00:00:00 2001 From: "Kazushi (Jam) Marukawa" Date: Fri, 13 Aug 2021 19:30:44 +0900 Subject: [PATCH 099/700] [CLANG][PATCH][FPEnv] Add support for option -ffp-eval-method and extend #pragma float_control similarly Need to update a clang regression test for VE after https://reviews.llvm.org/D93769. Reviewed By: simoll Differential Revision: https://reviews.llvm.org/D108069 --- clang/test/Preprocessor/init-ve.c | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/test/Preprocessor/init-ve.c b/clang/test/Preprocessor/init-ve.c index 4686315f4ea06..b3ff47d54c131 100644 --- a/clang/test/Preprocessor/init-ve.c +++ b/clang/test/Preprocessor/init-ve.c @@ -32,7 +32,6 @@ // VE:#define __FLT_DENORM_MIN__ 1.40129846e-45F // VE:#define __FLT_DIG__ 6 // VE:#define __FLT_EPSILON__ 1.19209290e-7F -// VE:#define __FLT_EVAL_METHOD__ 0 // VE:#define __FLT_HAS_DENORM__ 1 // VE:#define __FLT_HAS_INFINITY__ 1 // VE:#define __FLT_HAS_QUIET_NAN__ 1 From 52cac541d4316a516632f6b71ab6d0a1d7c2be37 Mon Sep 17 00:00:00 2001 From: AndreyChurbanov Date: Mon, 16 Aug 2021 13:39:23 +0300 Subject: [PATCH 100/700] [OpenMP] libomp: cleanup: minor fixes to silence static analyzer. Added couple more checks to silence KlocWork static code analyzer. 
Differential Revision: https://reviews.llvm.org/D107348 --- openmp/runtime/src/kmp_affinity.cpp | 5 ++++- openmp/runtime/src/kmp_alloc.cpp | 2 +- openmp/runtime/src/kmp_dispatch.cpp | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index d0a70d292a51c..404586487f4be 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -2386,7 +2386,10 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; - KMP_ASSERT(nodeIdIndex + level <= maxIndex); + // validate the input before using level: + if (level > (unsigned)__kmp_xproc) { // level is too big + level = __kmp_xproc; + } if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; threadInfo[num_avail][nodeIdIndex + level] = val; diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp index d379e71f921e9..a67d298ec756f 100644 --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -883,7 +883,7 @@ static void bpool(kmp_info_t *th, void *buf, bufsize len) { __kmp_bget_dequeue(th); /* Release any queued buffers */ #ifdef SizeQuant - len &= ~(SizeQuant - 1); + len &= ~((bufsize)(SizeQuant - 1)); #endif if (thr->pool_len == 0) { thr->pool_len = len; diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index cc2d0012bf38d..c97ffb2dd336c 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -668,6 +668,8 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, case kmp_sch_static_chunked: case kmp_sch_dynamic_chunked: dynamic_init: + if (tc == 0) + break; if (pr->u.p.parm1 <= 0) pr->u.p.parm1 = KMP_DEFAULT_CHUNK; else if (pr->u.p.parm1 > tc) From 38c3cebd7d5aab0a27fb78d10bf3c783d760b186 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 16 Aug 2021 11:48:25 +0100 
Subject: [PATCH 101/700] [LoopPeel] Add test with multiple exit blocks branching to unreachable. Add test as suggested by @ebedev.ri in D108108. --- .../peel-multiple-unreachable-exits.ll | 86 +++++++++++++++++-- 1 file changed, 80 insertions(+), 6 deletions(-) diff --git a/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll b/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll index 0a562900405cd..435a8015010de 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-multiple-unreachable-exits.ll @@ -3,8 +3,8 @@ declare void @foo() -define void @unroll_unreachable_exit_and_latch_exit(i32* %ptr, i32 %N, i32 %x) { -; CHECK-LABEL: @unroll_unreachable_exit_and_latch_exit( +define void @peel_unreachable_exit_and_latch_exit(i32* %ptr, i32 %N, i32 %x) { +; CHECK-LABEL: @peel_unreachable_exit_and_latch_exit( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: @@ -60,8 +60,8 @@ unreachable.exit: unreachable } -define void @unroll_unreachable_exit_and_header_exit(i32* %ptr, i32 %N, i32 %x) { -; CHECK-LABEL: @unroll_unreachable_exit_and_header_exit( +define void @peel_unreachable_exit_and_header_exit(i32* %ptr, i32 %N, i32 %x) { +; CHECK-LABEL: @peel_unreachable_exit_and_header_exit( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: @@ -105,8 +105,8 @@ unreachable.exit: unreachable } -define void @unroll_unreachable_and_multiple_reachable_exits(i32* %ptr, i32 %N, i32 %x) { -; CHECK-LABEL: @unroll_unreachable_and_multiple_reachable_exits( +define void @peel_unreachable_and_multiple_reachable_exits(i32* %ptr, i32 %N, i32 %x) { +; CHECK-LABEL: @peel_unreachable_and_multiple_reachable_exits( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: @@ -163,3 +163,77 @@ unreachable.exit: call void @foo() unreachable } + +define void @peel_exits_to_blocks_branch_to_unreachable_block(i32* 
%ptr, i32 %N, i32 %x, i1 %c.1) { +; CHECK-LABEL: @peel_exits_to_blocks_branch_to_unreachable_block( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], 2 +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: br i1 [[C_1:%.*]], label [[EXIT_1:%.*]], label [[LOOP_LATCH]] +; CHECK: else: +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i32 [[IV]], [[X:%.*]] +; CHECK-NEXT: br i1 [[C_2]], label [[EXIT_2:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[M:%.*]] = phi i32 [ 0, [[THEN]] ], [ [[X]], [[ELSE]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[M]], i32* [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[C_3:%.*]] = icmp ult i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_HEADER]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; CHECK: exit.1: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[UNREACHABLE_TERM:%.*]] +; CHECK: exit.2: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[UNREACHABLE_TERM]] +; CHECK: unreachable.term: +; CHECK-NEXT: call void @baz() +; CHECK-NEXT: unreachable +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop.latch ] + %c = icmp ult i32 %iv, 2 + br i1 %c, label %then, label %else + +then: + br i1 %c.1, label %exit.1, label %loop.latch + +else: + %c.2 = icmp eq i32 %iv, %x + br i1 %c.2, label %exit.2, label %loop.latch + +loop.latch: + %m = phi i32 [ 0, %then ], [ %x, %else ] + %gep = getelementptr i32, i32* %ptr, i32 %iv + store i32 %m, i32* %gep + %iv.next = add nuw nsw i32 %iv, 1 + %c.3 = icmp ult i32 %iv, 1000 + br i1 %c.3, label %loop.header, label %exit + +exit: + ret void + +exit.1: + call void @foo() + br label 
%unreachable.term + +exit.2: + call void @bar() + br label %unreachable.term + +unreachable.term: + call void @baz() + unreachable +} + +declare void @bar() +declare void @baz() From 79aed89ea3af606901ff716c38796bee43a765c0 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 16 Aug 2021 13:08:35 +0200 Subject: [PATCH 102/700] tsan: fix unused var warnings in a test Reviewed By: melver Differential Revision: https://reviews.llvm.org/D108118 --- compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp index 6e598323345ea..571fc0ab04b87 100644 --- a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp +++ b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp @@ -152,6 +152,7 @@ TEST(Trace, RestoreMutexLock) { uptr tag = kExternalTagNone; bool res = RestoreStack(thr->tid, v3::EventType::kLock, thr->sid, thr->epoch, 0x5001, 0, 0, &stk, &mset, &tag); + CHECK(res); CHECK_EQ(stk.size, 2); CHECK_EQ(stk.trace[0], 0x1000); CHECK_EQ(stk.trace[1], 0x4002); @@ -194,6 +195,7 @@ TEST(Trace, MultiPart) { bool res = RestoreStack(thr->tid, v3::EventType::kAccessExt, thr->sid, thr->epoch, 0x3000, 8, kAccessRead, &stk, &mset, &tag); + CHECK(res); CHECK_EQ(stk.size, 4); CHECK_EQ(stk.trace[0], 0x1000); CHECK_EQ(stk.trace[1], 0x2000); From febcedf18c75c603d6336b2373dcd6f01bc8028d Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 16 Aug 2021 14:30:00 +0300 Subject: [PATCH 103/700] Revert "[NFCI][IndVars] rewriteLoopExitValues(): nowadays SCEV should not change `GEP` base pointer" https://bugs.llvm.org/show_bug.cgi?id=51490 was filed. This reverts commit 35a8bdc775817ce13a6c9b5cf81502052634aa1f. 
--- llvm/lib/Transforms/Utils/LoopUtils.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 49dbb0439d31c..661b99f112117 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1346,9 +1346,6 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, // FIXME: isValidRewrite() is a hack. it should be an assert, eventually. Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion); - assert(Phi.ValidRewrite && - "Now that the SCEV is strict wrt pointer/integer types, this " - "invariant is expected to be uphold by SCEV itself."); if (!Phi.ValidRewrite) { DeadInsts.push_back(Phi.Expansion); continue; From 70ab32d38802fdc3f8eb0ea5cdf9e5fc7904e985 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 16 Aug 2021 12:30:52 +0100 Subject: [PATCH 104/700] [InstCombine] Regenerate AddOverFlow.ll test checks. --- llvm/test/Transforms/InstCombine/AddOverFlow.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/AddOverFlow.ll b/llvm/test/Transforms/InstCombine/AddOverFlow.ll index 13494206f078a..3b7babab6e144 100644 --- a/llvm/test/Transforms/InstCombine/AddOverFlow.ll +++ b/llvm/test/Transforms/InstCombine/AddOverFlow.ll @@ -49,9 +49,9 @@ declare i32 @__gxx_personality_v0(...); define i16 @add_bounded_values(i16 %a, i16 %b) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { ; CHECK-LABEL: @add_bounded_values( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = call i16 @bounded(i16 [[A:%.*]]), !range !0 +; CHECK-NEXT: [[C:%.*]] = call i16 @bounded(i16 [[A:%.*]]), !range [[RNG0:![0-9]+]] ; CHECK-NEXT: [[D:%.*]] = invoke i16 @bounded(i16 [[B:%.*]]) -; CHECK-NEXT: to label [[CONT:%.*]] unwind label [[LPAD:%.*]], !range !0 +; CHECK-NEXT: to label [[CONT:%.*]] unwind label [[LPAD:%.*]], !range [[RNG0]] ; CHECK: cont: ; CHECK-NEXT: 
[[E:%.*]] = add nuw i16 [[C]], [[D]] ; CHECK-NEXT: ret i16 [[E]] @@ -76,9 +76,9 @@ lpad: define i16 @add_bounded_values_2(i16 %a, i16 %b) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { ; CHECK-LABEL: @add_bounded_values_2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = call i16 @bounded(i16 [[A:%.*]]), !range !1 +; CHECK-NEXT: [[C:%.*]] = call i16 @bounded(i16 [[A:%.*]]), !range [[RNG1:![0-9]+]] ; CHECK-NEXT: [[D:%.*]] = invoke i16 @bounded(i16 [[B:%.*]]) -; CHECK-NEXT: to label [[CONT:%.*]] unwind label [[LPAD:%.*]], !range !1 +; CHECK-NEXT: to label [[CONT:%.*]] unwind label [[LPAD:%.*]], !range [[RNG1]] ; CHECK: cont: ; CHECK-NEXT: [[E:%.*]] = add i16 [[C]], [[D]] ; CHECK-NEXT: ret i16 [[E]] From a7dc71130ffb41195e2efac2221692875144c18f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 16 Aug 2021 12:57:05 +0100 Subject: [PATCH 105/700] [InstCombine] Add PR38021 nuw test case. --- llvm/test/Transforms/InstCombine/AddOverFlow.ll | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/AddOverFlow.ll b/llvm/test/Transforms/InstCombine/AddOverFlow.ll index 3b7babab6e144..e34e516c8b3ed 100644 --- a/llvm/test/Transforms/InstCombine/AddOverFlow.ll +++ b/llvm/test/Transforms/InstCombine/AddOverFlow.ll @@ -264,3 +264,14 @@ define i16 @ripple_no_nsw6(i16 %x, i16 %y) { %c = add i16 %b, %a ret i16 %c } + +define i8 @PR38021(i8 %x) { +; CHECK-LABEL: @PR38021( +; CHECK-NEXT: [[CLEAR_TOP_3_BITS:%.*]] = lshr i8 [[X:%.*]], 3 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i8 [[CLEAR_TOP_3_BITS]], -63 +; CHECK-NEXT: ret i8 [[ADD]] +; + %clear_top_3_bits = lshr i8 %x, 3 + %add = add i8 %clear_top_3_bits, 193 ; 0b11000001 + ret i8 %add +} From 2d45e332ba321a22996a1584af26d568b375b674 Mon Sep 17 00:00:00 2001 From: "tashuang.zk" Date: Mon, 16 Aug 2021 13:41:55 +0200 Subject: [PATCH 106/700] [MLIR][DISC] Revise ParallelLoopTilingPass with inbound_check mode Expand ParallelLoopTilingPass with an inbound_check mode. 
In default mode, the upper bound of the inner loop is from the min op; in inbound_check mode, the upper bound of the inner loop is the step of the outer loop and an additional inbound check will be emitted inside of the inner loop. This was 'FIXME' in the original codes and a typical usage is for GPU backends, thus the outer loop and inner loop can be mapped to blocks/threads in seperate. Differential Revision: https://reviews.llvm.org/D105455 --- mlir/include/mlir/Dialect/SCF/Passes.h | 7 +- mlir/include/mlir/Dialect/SCF/Passes.td | 6 +- mlir/include/mlir/Dialect/SCF/Transforms.h | 3 +- .../SCF/Transforms/ParallelLoopTiling.cpp | 93 ++++++++--- .../parallel-loop-tiling-inbound-check.mlir | 149 ++++++++++++++++++ 5 files changed, 237 insertions(+), 21 deletions(-) create mode 100644 mlir/test/Dialect/SCF/parallel-loop-tiling-inbound-check.mlir diff --git a/mlir/include/mlir/Dialect/SCF/Passes.h b/mlir/include/mlir/Dialect/SCF/Passes.h index f8ed2c429b47f..df6a27232a0d3 100644 --- a/mlir/include/mlir/Dialect/SCF/Passes.h +++ b/mlir/include/mlir/Dialect/SCF/Passes.h @@ -36,8 +36,13 @@ std::unique_ptr createParallelLoopFusionPass(); std::unique_ptr createParallelLoopSpecializationPass(); /// Creates a pass which tiles innermost parallel loops. +/// If noMinMaxBounds, the upper bound of the inner loop will +/// be a same value among different outter loop iterations, and +/// an additional inbound check will be emitted inside the internal +/// loops. std::unique_ptr -createParallelLoopTilingPass(llvm::ArrayRef tileSize = {}); +createParallelLoopTilingPass(llvm::ArrayRef tileSize = {}, + bool noMinMaxBounds = false); /// Creates a pass which folds arith ops on induction variable into /// loop range. 
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td index 5e2a3a81bc0f0..44a7617ae4d97 100644 --- a/mlir/include/mlir/Dialect/SCF/Passes.td +++ b/mlir/include/mlir/Dialect/SCF/Passes.td @@ -47,7 +47,11 @@ def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> { let options = [ ListOption<"tileSizes", "parallel-loop-tile-sizes", "int64_t", "Factors to tile parallel loops by", - "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">, + Option<"noMinMaxBounds", "no-min-max-bounds", "bool", + /*default=*/"false", + "Perform tiling with fixed upper bound with inbound check " + "inside the internal loops"> ]; let dependentDialects = ["AffineDialect"]; } diff --git a/mlir/include/mlir/Dialect/SCF/Transforms.h b/mlir/include/mlir/Dialect/SCF/Transforms.h index 5cb816c808fa1..603fd3f0e9c30 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms.h @@ -87,7 +87,8 @@ LogicalResult peelForLoop(RewriterBase &b, ForOp forOp, scf::IfOp &ifOp); /// The function returns the resulting ParallelOps, i.e. {outer_loop_op, /// inner_loop_op}. 
std::pair -tileParallelLoop(ParallelOp op, llvm::ArrayRef tileSizes); +tileParallelLoop(ParallelOp op, llvm::ArrayRef tileSizes, + bool noMinMaxBounds); /// Populates patterns for SCF structural type conversions and sets up the /// provided ConversionTarget with the appropriate legality configuration for diff --git a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp index 8282c0771f302..af8dcaf4b5c38 100644 --- a/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/ParallelLoopTiling.cpp @@ -33,12 +33,25 @@ using namespace mlir::scf; /// min(%arg5*tileSize[1], %arg3-%i1)) /// step (%arg4, %arg5) /// +/// or, when no-min-max-bounds is true, into +/// scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) +/// step (%arg4*tileSize[0], +/// %arg5*tileSize[1]) +/// scf.parallel (%j0, %j1) = (0, 0) to (%arg4*tileSize[0], +/// %arg5*tileSize[1]) +/// step (%arg4, %arg5) +/// %inbound = (%j0 * %arg4 + %i0 < %arg2) && +/// (%j1 * %arg5 + %i1 < %arg3) +/// scf.if (%inbound) +/// .... +/// /// where the uses of %i0 and %i1 in the loop body are replaced by /// %i0 + j0 and %i1 + %j1. // /// The old loop is replaced with the new one. std::pair -mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef tileSizes) { +mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef tileSizes, + bool noMinMaxBounds) { OpBuilder b(op); auto zero = b.create(op.getLoc(), 0); SmallVector tileSizeConstants; @@ -64,8 +77,6 @@ mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef tileSizes) { b.setInsertionPointToStart(outerLoop.getBody()); // Compute min(size, dim - offset) to avoid out-of-bounds accesses. - // FIXME: Instead of using min, we want to replicate the tail. This would give - // the inner loop constant bounds for easy vectorization. 
auto minMap = AffineMap::get( /*dimCount=*/3, /*symbolCount=*/0, {getAffineDimExpr(/*position=*/0, b.getContext()), @@ -76,6 +87,7 @@ mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef tileSizes) { // Create the inner loop with adjusted bounds. SmallVector newBounds; newBounds.reserve(op.upperBound().size()); + bool needInboundCheck = false; for (auto dim : llvm::zip(outerLoop.lowerBound(), outerLoop.upperBound(), outerLoop.step(), outerLoop.getInductionVars(), op.step(), tileSizeConstants)) { @@ -101,6 +113,14 @@ mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef tileSizes) { continue; } } + + // For InboundCheck mode, just use the variable outer step + if (noMinMaxBounds) { + newBounds.push_back(newStep); + needInboundCheck = true; + continue; + } + // Otherwise, we dynamically compute the bound for // each iteration of the outer loop. newBounds.push_back( @@ -111,17 +131,51 @@ mlir::scf::tileParallelLoop(ParallelOp op, ArrayRef tileSizes) { op.getLoc(), SmallVector(newBounds.size(), zero), newBounds, op.step()); - // Steal the body of the old parallel loop and erase it. - innerLoop.region().takeBody(op.region()); - - // Insert computation for new index vectors and replace uses. 
- b.setInsertionPointToStart(innerLoop.getBody()); - for (auto ivs : - llvm::zip(innerLoop.getInductionVars(), outerLoop.getInductionVars())) { - Value inner_index = std::get<0>(ivs); - AddIOp newIndex = - b.create(op.getLoc(), std::get<0>(ivs), std::get<1>(ivs)); - inner_index.replaceAllUsesExcept(newIndex, newIndex); + if (noMinMaxBounds && needInboundCheck) { + b.setInsertionPointToStart(innerLoop.getBody()); + // Insert in-bound check + Value inbound = + b.create(op.getLoc(), b.getIntegerType(1), + b.getIntegerAttr(b.getIntegerType(1), 1)); + for (auto dim : + llvm::zip(outerLoop.upperBound(), outerLoop.getInductionVars(), + innerLoop.getInductionVars(), innerLoop.step())) { + Value outerUpperBound, outerIV, innerIV, innerStep; + std::tie(outerUpperBound, outerIV, innerIV, innerStep) = dim; + // %in_bound = %in_bound && + // (%inner_iv * %inner_step + %outer_iv < %outer_upper_bound) + Value index = b.create( + op.getLoc(), b.create(op.getLoc(), innerIV, innerStep), + outerIV); + Value dimInbound = b.create(op.getLoc(), CmpIPredicate::ult, + index, outerUpperBound); + inbound = b.create(op.getLoc(), inbound, dimInbound); + } + auto ifInbound = b.create(op.getLoc(), + /*resultTypes*/ ArrayRef{}, inbound, + /*hasElseRegion*/ false); + ifInbound.thenRegion().takeBody(op.region()); + Block &thenBlock = ifInbound.thenRegion().front(); + b.setInsertionPointToStart(innerLoop.getBody()); + for (auto ivs : llvm::enumerate(llvm::zip(innerLoop.getInductionVars(), + outerLoop.getInductionVars()))) { + AddIOp newIndex = b.create(op.getLoc(), std::get<0>(ivs.value()), + std::get<1>(ivs.value())); + thenBlock.getArgument(ivs.index()) + .replaceAllUsesExcept(newIndex, newIndex); + } + thenBlock.eraseArguments(llvm::to_vector<4>( + llvm::seq((unsigned)0, thenBlock.getNumArguments()))); + } else { + innerLoop.region().takeBody(op.region()); + b.setInsertionPointToStart(innerLoop.getBody()); + for (auto ivs : llvm::zip(innerLoop.getInductionVars(), + outerLoop.getInductionVars())) 
{ + Value innerIndex = std::get<0>(ivs); + AddIOp newIndex = + b.create(op.getLoc(), std::get<0>(ivs), std::get<1>(ivs)); + innerIndex.replaceAllUsesExcept(newIndex, newIndex); + } } op.erase(); @@ -132,8 +186,10 @@ namespace { struct ParallelLoopTiling : public SCFParallelLoopTilingBase { ParallelLoopTiling() = default; - explicit ParallelLoopTiling(ArrayRef tileSizes) { + explicit ParallelLoopTiling(ArrayRef tileSizes, + bool noMinMaxBounds = false) { this->tileSizes = tileSizes; + this->noMinMaxBounds = noMinMaxBounds; } void runOnFunction() override { @@ -142,13 +198,14 @@ struct ParallelLoopTiling for (ParallelOp ploop : innermostPloops) { // FIXME: Add reduction support. if (ploop.getNumReductions() == 0) - tileParallelLoop(ploop, tileSizes); + tileParallelLoop(ploop, tileSizes, noMinMaxBounds); } } }; } // namespace std::unique_ptr -mlir::createParallelLoopTilingPass(ArrayRef tileSizes) { - return std::make_unique(tileSizes); +mlir::createParallelLoopTilingPass(ArrayRef tileSizes, + bool noMinMaxBounds) { + return std::make_unique(tileSizes, noMinMaxBounds); } diff --git a/mlir/test/Dialect/SCF/parallel-loop-tiling-inbound-check.mlir b/mlir/test/Dialect/SCF/parallel-loop-tiling-inbound-check.mlir new file mode 100644 index 0000000000000..8f395c3b2828a --- /dev/null +++ b/mlir/test/Dialect/SCF/parallel-loop-tiling-inbound-check.mlir @@ -0,0 +1,149 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.func(parallel-loop-tiling{parallel-loop-tile-sizes=1,4 no-min-max-bounds=true})' -split-input-file | FileCheck %s + +func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, %arg4 : index, %arg5 : index, + %A: memref, %B: memref, + %C: memref, %result: memref) { + scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%arg4, %arg5) { + %B_elem = memref.load %B[%i0, %i1] : memref + %C_elem = memref.load %C[%i0, %i1] : memref + %sum_elem = addf %B_elem, %C_elem : f32 + memref.store %sum_elem, %result[%i0, %i1] : memref + } + return +} + 
+// CHECK-LABEL: func @parallel_loop( +// CHECK-SAME: [[ARG1:%.*]]: index, [[ARG2:%.*]]: index, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index, [[ARG5:%.*]]: index, [[ARG6:%.*]]: index, [[ARG7:%.*]]: memref, [[ARG8:%.*]]: memref, [[ARG9:%.*]]: memref, [[ARG10:%.*]]: memref) { +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: [[C4:%.*]] = constant 4 : index +// CHECK: [[V1:%.*]] = muli [[ARG5]], [[C1]] : index +// CHECK: [[V2:%.*]] = muli [[ARG6]], [[C4]] : index +// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[ARG1]], [[ARG2]]) to ([[ARG3]], [[ARG4]]) step ([[V1]], [[V2]]) { +// CHECK: scf.parallel ([[V7:%.*]], [[V8:%.*]]) = ([[C0]], [[C0]]) to ([[V1]], [[V2]]) step ([[ARG5]], [[ARG6]]) { +// CHECK: [[V9:%.*]] = addi [[V7]], [[V3]] : index +// CHECK: [[V10:%.*]] = addi [[V8]], [[V4]] : index +// CHECK: %true = constant true +// CHECK: [[V11:%.*]] = muli [[V7]], [[ARG5]] : index +// CHECK: [[V12:%.*]] = addi [[V11]], [[V3]] : index +// CHECK: [[V13:%.*]] = cmpi ult, [[V12]], [[ARG3]] : index +// CHECK: [[V14:%.*]] = and %true, [[V13]] : i1 +// CHECK: [[V15:%.*]] = muli [[V8]], [[ARG6]] : index +// CHECK: [[V16:%.*]] = addi [[V15]], [[V4]] : index +// CHECK: [[V17:%.*]] = cmpi ult, [[V16]], [[ARG4]] : index +// CHECK: [[V18:%.*]] = and [[V14]], [[V17]] : i1 +// CHECK: scf.if [[V18]] { +// CHECK: [[V19:%.*]] = memref.load [[ARG8]]{{\[}}[[V9]], [[V10]]] : memref +// CHECK: [[V20:%.*]] = memref.load [[ARG9]]{{\[}}[[V9]], [[V10]]] : memref +// CHECK: [[V21:%.*]] = addf [[V19]], [[V20]] : f32 +// CHECK: memref.store [[V21]], [[ARG10]]{{\[}}[[V9]], [[V10]]] : memref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: return + +// ----- + +func @static_loop_with_step() { + %c0 = constant 0 : index + %c3 = constant 3 : index + %c22 = constant 22 : index + %c24 = constant 24 : index + scf.parallel (%i0, %i1) = (%c0, %c0) to (%c22, %c24) step (%c3, %c3) { + } + return +} + +// CHECK-LABEL: func @static_loop_with_step() { +// CHECK: 
[[C0:%.*]] = constant 0 : index +// CHECK: [[C3:%.*]] = constant 3 : index +// CHECK: [[C22:%.*]] = constant 22 : index +// CHECK: [[C24:%.*]] = constant 24 : index +// CHECK: [[C0_1:%.*]] = constant 0 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: [[C4:%.*]] = constant 4 : index +// CHECK: [[V1:%.*]] = muli [[C3]], [[C1]] : index +// CHECK: [[V2:%.*]] = muli [[C3]], [[C4]] : index +// CHECK: scf.parallel ([[V3:%.*]], [[V4:%.*]]) = ([[C0]], [[C0]]) to ([[C22]], [[C24]]) step ([[V1]], [[V2]]) { +// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V1]], [[V2]]) step ([[C3]], [[C3]]) { +// CHECK-NOT: scf.if +// CHECK: = addi [[V5]], [[V3]] : index +// CHECK: = addi [[V6]], [[V4]] : index +// CHECK: } +// CHECK: } +// CHECK: return + +// ----- + +func @tile_nested_innermost() { + %c2 = constant 2 : index + %c0 = constant 0 : index + %c1 = constant 1 : index + scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + scf.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + } + } + scf.parallel (%i, %j) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + } + return +} + +// CHECK-LABEL: func @tile_nested_innermost() { +// CHECK: [[C2:%.*]] = constant 2 : index +// CHECK: [[C0:%.*]] = constant 0 : index +// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK: scf.parallel ([[V1:%.*]], [[V2:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[C1]], [[C1]]) { +// CHECK: [[C0_1:%.*]] = constant 0 : index +// CHECK: [[C1_1:%.*]] = constant 1 : index +// CHECK: [[C4:%.*]] = constant 4 : index +// CHECK: [[V3:%.*]] = muli [[C1]], [[C1_1]] : index +// CHECK: [[V4:%.*]] = muli [[C1]], [[C4]] : index +// CHECK: scf.parallel ([[V5:%.*]], [[V6:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V3]], [[V4]]) { +// CHECK: scf.parallel ([[V8:%.*]], [[V9:%.*]]) = ([[C0_1]], [[C0_1]]) to ([[V3]], [[V4]]) step ([[C1]], [[C1]]) { +// CHECK: = addi [[V8]], [[V5]] : index +// CHECK: = addi [[V9]], [[V6]] : index +// CHECK: scf.if +// 
CHECK: } +// CHECK: } +// CHECK: } +// CHECK: [[C0_2:%.*]] = constant 0 : index +// CHECK: [[C1_2:%.*]] = constant 1 : index +// CHECK: [[C4_1:%.*]] = constant 4 : index +// CHECK: [[V10:%.*]] = muli [[C1]], [[C1_2]] : index +// CHECK: [[V11:%.*]] = muli [[C1]], [[C4_1]] : index +// CHECK: scf.parallel ([[V12:%.*]], [[V13:%.*]]) = ([[C0]], [[C0]]) to ([[C2]], [[C2]]) step ([[V10]], [[V11]]) { +// CHECK: scf.parallel ([[V15:%.*]], [[V16:%.*]]) = ([[C0_2]], [[C0_2]]) to ([[V10]], [[V11]]) step ([[C1]], [[C1]]) { +// CHECK: = addi [[V15]], [[V12]] : index +// CHECK: = addi [[V16]], [[V13]] : index +// CHECK: scf.if +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func @tile_nested_in_non_ploop() { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + scf.for %i = %c0 to %c2 step %c1 { + scf.for %j = %c0 to %c2 step %c1 { + scf.parallel (%k, %l) = (%c0, %c0) to (%c2, %c2) step (%c1, %c1) { + } + } + } + return +} + +// CHECK-LABEL: func @tile_nested_in_non_ploop +// CHECK: scf.for +// CHECK: scf.for +// CHECK: scf.parallel +// CHECK: scf.parallel +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: } From 2c5c06c5cfca7988b2d69a78459be27beb35a86f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 16 Aug 2021 13:12:13 +0100 Subject: [PATCH 107/700] [X86] Add PR46315 test case --- llvm/test/CodeGen/X86/pr46315.ll | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 llvm/test/CodeGen/X86/pr46315.ll diff --git a/llvm/test/CodeGen/X86/pr46315.ll b/llvm/test/CodeGen/X86/pr46315.ll new file mode 100644 index 0000000000000..e42c19fb404b6 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr46315.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @PR46315() { +; CHECK-LABEL: PR46315: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %esi, %esi +; 
CHECK-NEXT: movl $2, %edx +; CHECK-NEXT: movl $3, %ecx +; CHECK-NEXT: movl $1, %r8d +; CHECK-NEXT: jmp h@PLT # TAILCALL + tail call void @h(i32 0, i32 0, i32 2, i32 3, i32 1) + ret void +} + +declare void @h(i32, i32, i32, i32, i32) From c019142a89b477cd247434c1d8f571662d26e19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Wed, 7 Jul 2021 11:58:26 +0200 Subject: [PATCH 108/700] [analyzer][NFC] Split the main logic of NoStoreFuncVisitor to an abstract NoStateChangeVisitor class Preceding discussion on cfe-dev: https://lists.llvm.org/pipermail/cfe-dev/2021-June/068450.html NoStoreFuncVisitor is a rather unique visitor. As VisitNode is invoked on most other visitors, they are looking for the point where something changed -- change on a value, some checker-specific GDM trait, a new constraint. NoStoreFuncVisitor, however, looks specifically for functions that *didn't* write to a MemRegion of interesting. Quoting from its comments: /// Put a diagnostic on return statement of all inlined functions /// for which the region of interest \p RegionOfInterest was passed into, /// but not written inside, and it has caused an undefined read or a null /// pointer dereference outside. It so happens that there are a number of other similar properties that are worth checking. For instance, if some memory leaks, it might be interesting why a function didn't take ownership of said memory: void sink(int *P) {} // no notes void f() { sink(new int(5)); // note: Memory is allocated // Well hold on, sink() was supposed to deal with // that, this must be a false positive... } // warning: Potential memory leak [cplusplus.NewDeleteLeaks] In here, the entity of interest isn't a MemRegion, but a symbol. The property that changed here isn't a change of value, but rather liveness and GDM traits managed by MalloChecker. This patch moves some of the logic of NoStoreFuncVisitor to a new abstract class, NoStateChangeFuncVisitor. 
This is mostly calculating and caching the stack frames in which the entity of interest wasn't changed. Descendants of this interface have to define 3 things: * What constitutes as a change to an entity (this is done by overriding wasModifiedBeforeCallExit) * What the diagnostic message should be (this is done by overriding maybeEmitNoteFor.*) * What constitutes as the entity of interest being passed into the function (this is also done by overriding maybeEmitNoteFor.*) Differential Revision: https://reviews.llvm.org/D105553 --- .../Core/BugReporter/BugReporterVisitors.h | 79 ++++ .../Core/BugReporterVisitors.cpp | 358 ++++++++++-------- 2 files changed, 269 insertions(+), 168 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h b/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h index 24cae12af24a1..139b0dcd51704 100644 --- a/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h +++ b/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h @@ -21,6 +21,7 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include #include @@ -622,6 +623,84 @@ class TagVisitor : public BugReporterVisitor { PathSensitiveBugReport &R) override; }; +class ObjCMethodCall; +class CXXConstructorCall; + +/// Put a diagnostic on return statement (or on } in its absence) of all inlined +/// functions for which some property remained unchanged. +/// Resulting diagnostics may read such as "Returning without writing to X". +/// +/// Descendants can define what a "state change is", like a change of value +/// to a memory region, liveness, etc. For function calls where the state did +/// not change as defined, a custom note may be constructed. 
+class NoStateChangeFuncVisitor : public BugReporterVisitor { +private: + /// Frames modifying the state as defined in \c wasModifiedBeforeCallExit. + /// This visitor generates a note only if a function does *not* change the + /// state that way. This information is not immediately available + /// by looking at the node associated with the exit from the function + /// (usually the return statement). To avoid recomputing the same information + /// many times (going up the path for each node and checking whether the + /// region was written into) we instead lazily compute the stack frames + /// along the path. + llvm::SmallPtrSet FramesModifying; + llvm::SmallPtrSet FramesModifyingCalculated; + + /// Check and lazily calculate whether the state is modified in the stack + /// frame to which \p CallExitBeginN belongs. + /// The calculation is cached in FramesModifying. + bool isModifiedInFrame(const ExplodedNode *CallExitBeginN); + + /// Write to \c FramesModifying all stack frames along the path in the current + /// stack frame which modifies the state. + void findModifyingFrames(const ExplodedNode *const CallExitBeginN); + +protected: + bugreporter::TrackingKind TKind; + + /// \return Whether the state was modified from the current node, \CurrN, to + /// the end of the stack fram, at \p CallExitBeginN. + virtual bool + wasModifiedBeforeCallExit(const ExplodedNode *CurrN, + const ExplodedNode *CallExitBeginN) = 0; + + /// Consume the information on the non-modifying stack frame in order to + /// either emit a note or not. May suppress the report entirely. + /// \return Diagnostics piece for the unmodified state in the current + /// function, if it decides to emit one. A good description might start with + /// "Returning without...". 
+ virtual PathDiagnosticPieceRef + maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) = 0; + + /// Consume the information on the non-modifying stack frame in order to + /// either emit a note or not. May suppress the report entirely. + /// \return Diagnostics piece for the unmodified state in the current + /// function, if it decides to emit one. A good description might start with + /// "Returning without...". + virtual PathDiagnosticPieceRef + maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) = 0; + + /// Consume the information on the non-modifying stack frame in order to + /// either emit a note or not. May suppress the report entirely. + /// \return Diagnostics piece for the unmodified state in the current + /// function, if it decides to emit one. A good description might start with + /// "Returning without...". + virtual PathDiagnosticPieceRef + maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N) = 0; + +public: + NoStateChangeFuncVisitor(bugreporter::TrackingKind TKind) : TKind(TKind) {} + + PathDiagnosticPieceRef VisitNode(const ExplodedNode *N, + BugReporterContext &BR, + PathSensitiveBugReport &R) override final; +}; + } // namespace ento } // namespace clang diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp index d06a2d4933038..2c54bffabc43f 100644 --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -343,46 +343,140 @@ BugReporterVisitor::getDefaultEndPath(const BugReporterContext &BRC, return P; } +//===----------------------------------------------------------------------===// +// Implementation of NoStateChangeFuncVisitor. 
+//===----------------------------------------------------------------------===// + +bool NoStateChangeFuncVisitor::isModifiedInFrame(const ExplodedNode *N) { + const LocationContext *Ctx = N->getLocationContext(); + const StackFrameContext *SCtx = Ctx->getStackFrame(); + if (!FramesModifyingCalculated.count(SCtx)) + findModifyingFrames(N); + return FramesModifying.count(SCtx); +} + +void NoStateChangeFuncVisitor::findModifyingFrames( + const ExplodedNode *const CallExitBeginN) { + + assert(CallExitBeginN->getLocationAs()); + const ExplodedNode *LastReturnN = CallExitBeginN; + const StackFrameContext *const OriginalSCtx = + CallExitBeginN->getLocationContext()->getStackFrame(); + + const ExplodedNode *CurrN = CallExitBeginN; + + do { + ProgramStateRef State = CurrN->getState(); + auto CallExitLoc = CurrN->getLocationAs(); + if (CallExitLoc) { + LastReturnN = CurrN; + } + + FramesModifyingCalculated.insert( + CurrN->getLocationContext()->getStackFrame()); + + if (wasModifiedBeforeCallExit(CurrN, LastReturnN)) { + const StackFrameContext *SCtx = CurrN->getStackFrame(); + while (!SCtx->inTopFrame()) { + auto p = FramesModifying.insert(SCtx); + if (!p.second) + break; // Frame and all its parents already inserted. + SCtx = SCtx->getParent()->getStackFrame(); + } + } + + // Stop calculation at the call to the current function. + if (auto CE = CurrN->getLocationAs()) + if (CE->getCalleeContext() == OriginalSCtx) + break; + + CurrN = CurrN->getFirstPred(); + } while (CurrN); +} + +PathDiagnosticPieceRef NoStateChangeFuncVisitor::VisitNode( + const ExplodedNode *N, BugReporterContext &BR, PathSensitiveBugReport &R) { + + const LocationContext *Ctx = N->getLocationContext(); + const StackFrameContext *SCtx = Ctx->getStackFrame(); + ProgramStateRef State = N->getState(); + auto CallExitLoc = N->getLocationAs(); + + // No diagnostic if region was modified inside the frame. 
+ if (!CallExitLoc || isModifiedInFrame(N)) + return nullptr; + + CallEventRef<> Call = + BR.getStateManager().getCallEventManager().getCaller(SCtx, State); + + // Optimistically suppress uninitialized value bugs that result + // from system headers having a chance to initialize the value + // but failing to do so. It's too unlikely a system header's fault. + // It's much more likely a situation in which the function has a failure + // mode that the user decided not to check. If we want to hunt such + // omitted checks, we should provide an explicit function-specific note + // describing the precondition under which the function isn't supposed to + // initialize its out-parameter, and additionally check that such + // precondition can actually be fulfilled on the current path. + if (Call->isInSystemHeader()) { + // We make an exception for system header functions that have no branches. + // Such functions unconditionally fail to initialize the variable. + // If they call other functions that have more paths within them, + // this suppression would still apply when we visit these inner functions. + // One common example of a standard function that doesn't ever initialize + // its out parameter is operator placement new; it's up to the follow-up + // constructor (if any) to initialize the memory. + if (!N->getStackFrame()->getCFG()->isLinear()) { + static int i = 0; + R.markInvalid(&i, nullptr); + } + return nullptr; + } + + if (const auto *MC = dyn_cast(Call)) { + // If we failed to construct a piece for self, we still want to check + // whether the entity of interest is in a parameter. + if (PathDiagnosticPieceRef Piece = maybeEmitNoteForObjCSelf(R, *MC, N)) + return Piece; + } + + if (const auto *CCall = dyn_cast(Call)) { + // Do not generate diagnostics for not modified parameters in + // constructors. 
+ return maybeEmitNoteForCXXThis(R, *CCall, N); + } + + return maybeEmitNoteForParameters(R, *Call, N); +} + //===----------------------------------------------------------------------===// // Implementation of NoStoreFuncVisitor. //===----------------------------------------------------------------------===// namespace { - /// Put a diagnostic on return statement of all inlined functions /// for which the region of interest \p RegionOfInterest was passed into, /// but not written inside, and it has caused an undefined read or a null /// pointer dereference outside. -class NoStoreFuncVisitor final : public BugReporterVisitor { +class NoStoreFuncVisitor final : public NoStateChangeFuncVisitor { const SubRegion *RegionOfInterest; MemRegionManager &MmrMgr; const SourceManager &SM; const PrintingPolicy &PP; - bugreporter::TrackingKind TKind; /// Recursion limit for dereferencing fields when looking for the /// region of interest. /// The limit of two indicates that we will dereference fields only once. static const unsigned DEREFERENCE_LIMIT = 2; - /// Frames writing into \c RegionOfInterest. - /// This visitor generates a note only if a function does not write into - /// a region of interest. This information is not immediately available - /// by looking at the node associated with the exit from the function - /// (usually the return statement). To avoid recomputing the same information - /// many times (going up the path for each node and checking whether the - /// region was written into) we instead lazily compute the - /// stack frames along the path which write into the region of interest. 
- llvm::SmallPtrSet FramesModifyingRegion; - llvm::SmallPtrSet FramesModifyingCalculated; - using RegionVector = SmallVector; public: NoStoreFuncVisitor(const SubRegion *R, bugreporter::TrackingKind TKind) - : RegionOfInterest(R), MmrMgr(R->getMemRegionManager()), + : NoStateChangeFuncVisitor(TKind), RegionOfInterest(R), + MmrMgr(R->getMemRegionManager()), SM(MmrMgr.getContext().getSourceManager()), - PP(MmrMgr.getContext().getPrintingPolicy()), TKind(TKind) {} + PP(MmrMgr.getContext().getPrintingPolicy()) {} void Profile(llvm::FoldingSetNodeID &ID) const override { static int Tag = 0; @@ -395,11 +489,13 @@ class NoStoreFuncVisitor final : public BugReporterVisitor { return static_cast(&Tag); } - PathDiagnosticPieceRef VisitNode(const ExplodedNode *N, - BugReporterContext &BR, - PathSensitiveBugReport &R) override; - private: + /// \return Whether \c RegionOfInterest was modified at \p CurrN compared to + /// the value it holds in \p CallExitBeginN. + virtual bool + wasModifiedBeforeCallExit(const ExplodedNode *CurrN, + const ExplodedNode *CallExitBeginN) override; + /// Attempts to find the region of interest in a given record decl, /// by either following the base classes or fields. /// Dereferences fields up to a given recursion limit. @@ -411,20 +507,21 @@ class NoStoreFuncVisitor final : public BugReporterVisitor { const MemRegion *R, const RegionVector &Vec = {}, int depth = 0); - /// Check and lazily calculate whether the region of interest is - /// modified in the stack frame to which \p N belongs. - /// The calculation is cached in FramesModifyingRegion. 
- bool isRegionOfInterestModifiedInFrame(const ExplodedNode *N) { - const LocationContext *Ctx = N->getLocationContext(); - const StackFrameContext *SCtx = Ctx->getStackFrame(); - if (!FramesModifyingCalculated.count(SCtx)) - findModifyingFrames(N); - return FramesModifyingRegion.count(SCtx); - } + // Region of interest corresponds to an IVar, exiting a method + // which could have written into that IVar, but did not. + virtual PathDiagnosticPieceRef + maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) override final; + + virtual PathDiagnosticPieceRef + maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) override final; - /// Write to \c FramesModifyingRegion all stack frames along - /// the path in the current stack frame which modify \c RegionOfInterest. - void findModifyingFrames(const ExplodedNode *N); + virtual PathDiagnosticPieceRef + maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N) override final; /// Consume the information on the no-store stack frame in order to /// either emit a note or suppress the report enirely. @@ -436,22 +533,18 @@ class NoStoreFuncVisitor final : public BugReporterVisitor { const MemRegion *MatchedRegion, StringRef FirstElement, bool FirstIsReferenceType, unsigned IndirectionLevel); - /// Pretty-print region \p MatchedRegion to \p os. - /// \return Whether printing succeeded. - bool prettyPrintRegionName(StringRef FirstElement, bool FirstIsReferenceType, + bool prettyPrintRegionName(const RegionVector &FieldChain, const MemRegion *MatchedRegion, - const RegionVector &FieldChain, - int IndirectionLevel, + StringRef FirstElement, bool FirstIsReferenceType, + unsigned IndirectionLevel, llvm::raw_svector_ostream &os); - /// Print first item in the chain, return new separator. 
- static StringRef prettyPrintFirstElement(StringRef FirstElement, - bool MoreItemsExpected, - int IndirectionLevel, - llvm::raw_svector_ostream &os); + StringRef prettyPrintFirstElement(StringRef FirstElement, + bool MoreItemsExpected, + int IndirectionLevel, + llvm::raw_svector_ostream &os); }; - -} // end of anonymous namespace +} // namespace /// \return Whether the method declaration \p Parent /// syntactically has a binary operation writing into the ivar \p Ivar. @@ -486,25 +579,6 @@ static bool potentiallyWritesIntoIvar(const Decl *Parent, return false; } -/// Get parameters associated with runtime definition in order -/// to get the correct parameter name. -static ArrayRef getCallParameters(CallEventRef<> Call) { - // Use runtime definition, if available. - RuntimeDefinition RD = Call->getRuntimeDefinition(); - if (const auto *FD = dyn_cast_or_null(RD.getDecl())) - return FD->parameters(); - if (const auto *MD = dyn_cast_or_null(RD.getDecl())) - return MD->parameters(); - - return Call->parameters(); -} - -/// \return whether \p Ty points to a const type, or is a const reference. -static bool isPointerToConst(QualType Ty) { - return !Ty->getPointeeType().isNull() && - Ty->getPointeeType().getCanonicalType().isConstQualified(); -} - /// Attempts to find the region of interest in a given CXX decl, /// by either following the base classes or fields. /// Dereferences fields up to a given recursion limit. @@ -564,68 +638,66 @@ NoStoreFuncVisitor::findRegionOfInterestInRecord( } PathDiagnosticPieceRef -NoStoreFuncVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BR, - PathSensitiveBugReport &R) { - - const LocationContext *Ctx = N->getLocationContext(); - const StackFrameContext *SCtx = Ctx->getStackFrame(); - ProgramStateRef State = N->getState(); - auto CallExitLoc = N->getLocationAs(); - - // No diagnostic if region was modified inside the frame. 
- if (!CallExitLoc || isRegionOfInterestModifiedInFrame(N)) - return nullptr; - - CallEventRef<> Call = - BR.getStateManager().getCallEventManager().getCaller(SCtx, State); - - // Region of interest corresponds to an IVar, exiting a method - // which could have written into that IVar, but did not. - if (const auto *MC = dyn_cast(Call)) { - if (const auto *IvarR = dyn_cast(RegionOfInterest)) { - const MemRegion *SelfRegion = MC->getReceiverSVal().getAsRegion(); - if (RegionOfInterest->isSubRegionOf(SelfRegion) && - potentiallyWritesIntoIvar(Call->getRuntimeDefinition().getDecl(), - IvarR->getDecl())) - return maybeEmitNote(R, *Call, N, {}, SelfRegion, "self", - /*FirstIsReferenceType=*/false, 1); - } +NoStoreFuncVisitor::maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) { + if (const auto *IvarR = dyn_cast(RegionOfInterest)) { + const MemRegion *SelfRegion = Call.getReceiverSVal().getAsRegion(); + if (RegionOfInterest->isSubRegionOf(SelfRegion) && + potentiallyWritesIntoIvar(Call.getRuntimeDefinition().getDecl(), + IvarR->getDecl())) + return maybeEmitNote(R, Call, N, {}, SelfRegion, "self", + /*FirstIsReferenceType=*/false, 1); } + return nullptr; +} - if (const auto *CCall = dyn_cast(Call)) { - const MemRegion *ThisR = CCall->getCXXThisVal().getAsRegion(); - if (RegionOfInterest->isSubRegionOf(ThisR) && - !CCall->getDecl()->isImplicit()) - return maybeEmitNote(R, *Call, N, {}, ThisR, "this", - /*FirstIsReferenceType=*/false, 1); +PathDiagnosticPieceRef +NoStoreFuncVisitor::maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) { + const MemRegion *ThisR = Call.getCXXThisVal().getAsRegion(); + if (RegionOfInterest->isSubRegionOf(ThisR) && !Call.getDecl()->isImplicit()) + return maybeEmitNote(R, Call, N, {}, ThisR, "this", + /*FirstIsReferenceType=*/false, 1); + + // Do not generate diagnostics for not modified parameters in + // constructors. 
+ return nullptr; +} - // Do not generate diagnostics for not modified parameters in - // constructors. - return nullptr; - } +/// \return whether \p Ty points to a const type, or is a const reference. +static bool isPointerToConst(QualType Ty) { + return !Ty->getPointeeType().isNull() && + Ty->getPointeeType().getCanonicalType().isConstQualified(); +} - ArrayRef parameters = getCallParameters(Call); - for (unsigned I = 0; I < Call->getNumArgs() && I < parameters.size(); ++I) { - const ParmVarDecl *PVD = parameters[I]; - SVal V = Call->getArgSVal(I); +PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNoteForParameters( + PathSensitiveBugReport &R, const CallEvent &Call, const ExplodedNode *N) { + ArrayRef Parameters = Call.parameters(); + for (unsigned I = 0; I < Call.getNumArgs() && I < Parameters.size(); ++I) { + const ParmVarDecl *PVD = Parameters[I]; + SVal V = Call.getArgSVal(I); bool ParamIsReferenceType = PVD->getType()->isReferenceType(); std::string ParamName = PVD->getNameAsString(); - int IndirectionLevel = 1; + unsigned IndirectionLevel = 1; QualType T = PVD->getType(); while (const MemRegion *MR = V.getAsRegion()) { if (RegionOfInterest->isSubRegionOf(MR) && !isPointerToConst(T)) - return maybeEmitNote(R, *Call, N, {}, MR, ParamName, + return maybeEmitNote(R, Call, N, {}, MR, ParamName, ParamIsReferenceType, IndirectionLevel); QualType PT = T->getPointeeType(); if (PT.isNull() || PT->isVoidType()) break; + ProgramStateRef State = N->getState(); + if (const RecordDecl *RD = PT->getAsRecordDecl()) if (Optional P = findRegionOfInterestInRecord(RD, State, MR)) - return maybeEmitNote(R, *Call, N, *P, RegionOfInterest, ParamName, + return maybeEmitNote(R, Call, N, *P, RegionOfInterest, ParamName, ParamIsReferenceType, IndirectionLevel); V = State->getSVal(MR, PT); @@ -637,40 +709,11 @@ NoStoreFuncVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BR, return nullptr; } -void NoStoreFuncVisitor::findModifyingFrames(const ExplodedNode *N) { - 
assert(N->getLocationAs()); - ProgramStateRef LastReturnState = N->getState(); - SVal ValueAtReturn = LastReturnState->getSVal(RegionOfInterest); - const LocationContext *Ctx = N->getLocationContext(); - const StackFrameContext *OriginalSCtx = Ctx->getStackFrame(); - - do { - ProgramStateRef State = N->getState(); - auto CallExitLoc = N->getLocationAs(); - if (CallExitLoc) { - LastReturnState = State; - ValueAtReturn = LastReturnState->getSVal(RegionOfInterest); - } - - FramesModifyingCalculated.insert(N->getLocationContext()->getStackFrame()); - - if (wasRegionOfInterestModifiedAt(RegionOfInterest, N, ValueAtReturn)) { - const StackFrameContext *SCtx = N->getStackFrame(); - while (!SCtx->inTopFrame()) { - auto p = FramesModifyingRegion.insert(SCtx); - if (!p.second) - break; // Frame and all its parents already inserted. - SCtx = SCtx->getParent()->getStackFrame(); - } - } - - // Stop calculation at the call to the current function. - if (auto CE = N->getLocationAs()) - if (CE->getCalleeContext() == OriginalSCtx) - break; - - N = N->getFirstPred(); - } while (N); +bool NoStoreFuncVisitor::wasModifiedBeforeCallExit( + const ExplodedNode *CurrN, const ExplodedNode *CallExitBeginN) { + return ::wasRegionOfInterestModifiedAt( + RegionOfInterest, CurrN, + CallExitBeginN->getState()->getSVal(RegionOfInterest)); } static llvm::StringLiteral WillBeUsedForACondition = @@ -681,27 +724,6 @@ PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNote( const RegionVector &FieldChain, const MemRegion *MatchedRegion, StringRef FirstElement, bool FirstIsReferenceType, unsigned IndirectionLevel) { - // Optimistically suppress uninitialized value bugs that result - // from system headers having a chance to initialize the value - // but failing to do so. It's too unlikely a system header's fault. - // It's much more likely a situation in which the function has a failure - // mode that the user decided not to check. 
If we want to hunt such - // omitted checks, we should provide an explicit function-specific note - // describing the precondition under which the function isn't supposed to - // initialize its out-parameter, and additionally check that such - // precondition can actually be fulfilled on the current path. - if (Call.isInSystemHeader()) { - // We make an exception for system header functions that have no branches. - // Such functions unconditionally fail to initialize the variable. - // If they call other functions that have more paths within them, - // this suppression would still apply when we visit these inner functions. - // One common example of a standard function that doesn't ever initialize - // its out parameter is operator placement new; it's up to the follow-up - // constructor (if any) to initialize the memory. - if (!N->getStackFrame()->getCFG()->isLinear()) - R.markInvalid(getTag(), nullptr); - return nullptr; - } PathDiagnosticLocation L = PathDiagnosticLocation::create(N->getLocation(), SM); @@ -717,8 +739,8 @@ PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNote( os << "Returning without writing to '"; // Do not generate the note if failed to pretty-print. 
- if (!prettyPrintRegionName(FirstElement, FirstIsReferenceType, MatchedRegion, - FieldChain, IndirectionLevel, os)) + if (!prettyPrintRegionName(FieldChain, MatchedRegion, FirstElement, + FirstIsReferenceType, IndirectionLevel, os)) return nullptr; os << "'"; @@ -727,11 +749,11 @@ PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNote( return std::make_shared(L, os.str()); } -bool NoStoreFuncVisitor::prettyPrintRegionName(StringRef FirstElement, - bool FirstIsReferenceType, +bool NoStoreFuncVisitor::prettyPrintRegionName(const RegionVector &FieldChain, const MemRegion *MatchedRegion, - const RegionVector &FieldChain, - int IndirectionLevel, + StringRef FirstElement, + bool FirstIsReferenceType, + unsigned IndirectionLevel, llvm::raw_svector_ostream &os) { if (FirstIsReferenceType) From d3fdbda6b06884fe5c5b80f0099777d84e9287ff Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Mon, 16 Aug 2021 15:52:24 +0200 Subject: [PATCH 109/700] [Polly][Isl] Move to the new-polly-generator branch version of isl-noexceptions.h. NFCI This is part of an effort to reduce the differences between the custom C++ bindings used right now by polly in `lib/External/isl/include/isl/isl-noxceptions.h` and the official isl C++ interface. With this commit we are moving from the `polly-generator` branch to the `new-polly-generator` branch that is more mantainable and is based on the official C++ interface `cpp-checked.h`. Changes made: - There are now many sublcasses for `isl::ast_node` representing different isl types. Use `isl::ast_node_for`, `isl::ast_node_user`, `isl::ast_node_block` and `isl::ast_node_mark` where needed. - There are now many sublcasses for `isl::schedule_node` representing different isl types. Use `isl::schedule_node_mark`, `isl::schedule_node_extension`, `isl::schedule_node_band` and `isl::schedule_node_filter` where needed. - Replace the `isl::*::dump` with `dumpIslObj` since the isl dump method is not exposed in the C++ interface. 
- `isl::schedule_node::get_child` has been renamed to `isl::schedule_node::child` - `isl::pw_multi_aff::get_pw_aff` has been renamed to `isl::pw_multi_aff::at` - The constructor `isl::union_map(isl::union_pw_multi_aff)` has been replaced with the static method `isl::union_map::from()` - Replace usages of `isl::val::add_ui` with `isl::val::add` - `isl::union_set_list::alloc` is now a constructor - All the `isl_size` values are now wrapped inside the class `isl::size` use `isl::size::release` to get the internal `isl_size` value where needed. - `isl-noexceptions.h` has been generated by https://github.com/patacca/isl/commit/73f5ed1f4d1f72582f731590ef9e43d9ab1956ad No functional change intended. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D107225 --- polly/include/polly/CodeGen/IslNodeBuilder.h | 7 +- polly/include/polly/ScheduleTreeTransform.h | 2 +- polly/include/polly/Support/GICHelper.h | 45 + polly/include/polly/Support/ISLTools.h | 2 +- polly/lib/Analysis/DependenceInfo.cpp | 4 +- polly/lib/Analysis/ScopBuilder.cpp | 51 +- polly/lib/Analysis/ScopInfo.cpp | 27 +- polly/lib/CodeGen/BlockGenerators.cpp | 5 +- polly/lib/CodeGen/IslAst.cpp | 4 +- polly/lib/CodeGen/IslNodeBuilder.cpp | 49 +- polly/lib/CodeGen/PPCGCodeGeneration.cpp | 18 +- polly/lib/Exchange/JSONExporter.cpp | 4 +- .../isl/include/isl/isl-noexceptions.h | 23537 +++++++++------- polly/lib/Support/GICHelper.cpp | 82 +- polly/lib/Support/ISLTools.cpp | 33 +- polly/lib/Transform/FlattenAlgo.cpp | 28 +- polly/lib/Transform/MatmulOptimizer.cpp | 38 +- .../lib/Transform/MaximalStaticExpansion.cpp | 8 +- polly/lib/Transform/ScheduleOptimizer.cpp | 22 +- polly/lib/Transform/ScheduleTreeTransform.cpp | 62 +- polly/lib/Transform/Simplify.cpp | 2 +- polly/lib/Transform/ZoneAlgo.cpp | 9 +- polly/unittests/Support/ISLTools.cpp | 2 +- 23 files changed, 13041 insertions(+), 11000 deletions(-) diff --git a/polly/include/polly/CodeGen/IslNodeBuilder.h 
b/polly/include/polly/CodeGen/IslNodeBuilder.h index bb729b8611473..ee0a1e58ae864 100644 --- a/polly/include/polly/CodeGen/IslNodeBuilder.h +++ b/polly/include/polly/CodeGen/IslNodeBuilder.h @@ -217,7 +217,8 @@ class IslNodeBuilder { // of loop iterations. // // 3. With the existing code, upper bounds have been easier to implement. - isl::ast_expr getUpperBound(isl::ast_node For, CmpInst::Predicate &Predicate); + isl::ast_expr getUpperBound(isl::ast_node_for For, + CmpInst::Predicate &Predicate); /// Return non-negative number of iterations in case of the following form /// of a loop and -1 otherwise. @@ -228,7 +229,7 @@ class IslNodeBuilder { /// /// NumIter is a non-negative integer value. Condition can have /// isl_ast_op_lt type. - int getNumberOfIterations(isl::ast_node For); + int getNumberOfIterations(isl::ast_node_for For); /// Compute the values and loops referenced in this subtree. /// @@ -317,7 +318,7 @@ class IslNodeBuilder { bool preloadInvariantEquivClass(InvariantEquivClassTy &IAClass); void createForVector(__isl_take isl_ast_node *For, int VectorWidth); - void createForSequential(isl::ast_node For, bool MarkParallel); + void createForSequential(isl::ast_node_for For, bool MarkParallel); /// Create LLVM-IR that executes a for node thread parallel. /// diff --git a/polly/include/polly/ScheduleTreeTransform.h b/polly/include/polly/ScheduleTreeTransform.h index 8727414c0bc3a..bdf4d77565207 100644 --- a/polly/include/polly/ScheduleTreeTransform.h +++ b/polly/include/polly/ScheduleTreeTransform.h @@ -134,7 +134,7 @@ struct RecursiveScheduleTreeVisitor /// By default, recursively visit the child nodes. RetTy visitNode(const isl::schedule_node &Node, Args... 
args) { - isl_size NumChildren = Node.n_children(); + isl_size NumChildren = Node.n_children().release(); for (isl_size i = 0; i < NumChildren; i += 1) getDerived().visit(Node.child(i), std::forward(args)...); return RetTy(); diff --git a/polly/include/polly/Support/GICHelper.h b/polly/include/polly/Support/GICHelper.h index 59f1f891c9488..5c6a3256c1c36 100644 --- a/polly/include/polly/Support/GICHelper.h +++ b/polly/include/polly/Support/GICHelper.h @@ -186,6 +186,51 @@ ISL_OBJECT_TO_STRING(union_pw_aff) ISL_OBJECT_TO_STRING(union_pw_multi_aff) //@} +/// C++ wrapper for isl_*_dump() functions. +//@{ +#define ISL_DUMP_OBJECT(name) \ + inline void dumpIslObj(const isl::name &Obj) { isl_##name##_dump(Obj.get()); } + +ISL_DUMP_OBJECT(aff) +ISL_DUMP_OBJECT(aff_list) +ISL_DUMP_OBJECT(ast_expr) +ISL_DUMP_OBJECT(ast_node) +ISL_DUMP_OBJECT(ast_node_list) +ISL_DUMP_OBJECT(basic_map) +ISL_DUMP_OBJECT(basic_map_list) +ISL_DUMP_OBJECT(basic_set) +ISL_DUMP_OBJECT(basic_set_list) +ISL_DUMP_OBJECT(constraint) +ISL_DUMP_OBJECT(id) +ISL_DUMP_OBJECT(id_list) +ISL_DUMP_OBJECT(id_to_ast_expr) +ISL_DUMP_OBJECT(local_space) +ISL_DUMP_OBJECT(map) +ISL_DUMP_OBJECT(map_list) +ISL_DUMP_OBJECT(multi_aff) +ISL_DUMP_OBJECT(multi_pw_aff) +ISL_DUMP_OBJECT(multi_union_pw_aff) +ISL_DUMP_OBJECT(multi_val) +ISL_DUMP_OBJECT(point) +ISL_DUMP_OBJECT(pw_aff) +ISL_DUMP_OBJECT(pw_aff_list) +ISL_DUMP_OBJECT(pw_multi_aff) +ISL_DUMP_OBJECT(schedule) +ISL_DUMP_OBJECT(schedule_constraints) +ISL_DUMP_OBJECT(schedule_node) +ISL_DUMP_OBJECT(set) +ISL_DUMP_OBJECT(set_list) +ISL_DUMP_OBJECT(space) +ISL_DUMP_OBJECT(union_map) +ISL_DUMP_OBJECT(union_pw_aff) +ISL_DUMP_OBJECT(union_pw_aff_list) +ISL_DUMP_OBJECT(union_pw_multi_aff) +ISL_DUMP_OBJECT(union_set) +ISL_DUMP_OBJECT(union_set_list) +ISL_DUMP_OBJECT(val) +ISL_DUMP_OBJECT(val_list) +//@} + inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, __isl_keep isl_union_map *Map) { OS << polly::stringFromIslObj(Map, "null"); diff --git 
a/polly/include/polly/Support/ISLTools.h b/polly/include/polly/Support/ISLTools.h index 8172389b7da0a..c4e62ca312852 100644 --- a/polly/include/polly/Support/ISLTools.h +++ b/polly/include/polly/Support/ISLTools.h @@ -32,7 +32,7 @@ struct isl_iterator using ElementT = list_element_type; explicit isl_iterator(const ListT &List) - : List(&List), Position(std::max(List.size(), 0)) {} + : List(&List), Position(std::max(List.size().release(), 0)) {} isl_iterator(const ListT &List, int Position) : List(&List), Position(Position) {} diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index f00507ad3a48d..709bce7ea3b60 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -190,7 +190,7 @@ static void collectInfo(Scop &S, isl_union_map *&Read, /// Fix all dimension of @p Zero to 0 and add it to @p user static void fixSetToZero(isl::set Zero, isl::union_set *User) { - for (auto i : seq(0, Zero.tuple_dim())) + for (auto i : seq(0, Zero.tuple_dim().release())) Zero = Zero.fix_si(isl::dim::set, i, 0); *User = User->unite(Zero); } @@ -667,7 +667,7 @@ bool Dependences::isValidSchedule( Dependences = Dependences.apply_range(Schedule); isl::set Zero = isl::set::universe(ScheduleSpace); - for (auto i : seq(0, Zero.tuple_dim())) + for (auto i : seq(0, Zero.tuple_dim().release())) Zero = Zero.fix_si(isl::dim::set, i, 0); isl::union_set UDeltas = Dependences.deltas(); diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 1aab9e4935284..4351d27f193ce 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -202,7 +202,7 @@ static bool containsErrorBlock(RegionNode *RN, const Region &R, LoopInfo &LI, static isl::map createNextIterationMap(isl::space SetSpace, unsigned Dim) { isl::space MapSpace = SetSpace.map_from_set(); isl::map NextIterationMap = isl::map::universe(MapSpace); - for (auto u : seq(0, 
NextIterationMap.domain_tuple_dim())) + for (auto u : seq(0, NextIterationMap.domain_tuple_dim().release())) if (u != (isl_size)Dim) NextIterationMap = NextIterationMap.equate(isl::dim::in, u, isl::dim::out, u); @@ -230,10 +230,10 @@ static isl::set collectBoundedParts(isl::set S) { /// both with regards to the dimension @p Dim. static std::pair partitionSetParts(isl::set S, unsigned Dim) { - for (unsigned u = 0, e = S.tuple_dim(); u < e; u++) + for (unsigned u = 0, e = S.tuple_dim().release(); u < e; u++) S = S.lower_bound_si(isl::dim::set, u, 0); - unsigned NumDimsS = S.tuple_dim(); + unsigned NumDimsS = S.tuple_dim().release(); isl::set OnlyDimS = S; // Remove dimensions that are greater than Dim as they are not interesting. @@ -328,7 +328,7 @@ isl::set ScopBuilder::adjustDomainDimensions(isl::set Dom, Loop *OldL, } else { assert(OldDepth > NewDepth); int Diff = OldDepth - NewDepth; - int NumDim = Dom.tuple_dim(); + int NumDim = Dom.tuple_dim().release(); assert(NumDim >= Diff); Dom = Dom.project_out(isl::dim::set, NumDim - Diff, Diff); } @@ -838,7 +838,7 @@ bool ScopBuilder::buildDomains( isl_set_universe(isl_space_set_alloc(scop->getIslCtx().get(), 0, LD + 1)); InvalidDomainMap[EntryBB] = isl::manage(isl_set_empty(isl_set_get_space(S))); - isl::noexceptions::set Domain = isl::manage(S); + isl::set Domain = isl::manage(S); scop->setDomain(EntryBB, Domain); if (IsOnlyNonAffineRegion) @@ -909,7 +909,7 @@ bool ScopBuilder::buildDomainsWithBranchConstraints( continue; isl::set Domain = scop->getDomainConditions(BB); - scop->updateMaxLoopDepth(Domain.tuple_dim()); + scop->updateMaxLoopDepth(Domain.tuple_dim().release()); auto *BBLoop = getRegionNodeLoop(RN, LI); // Propagate the domain from BB directly to blocks that have a superset @@ -983,7 +983,7 @@ bool ScopBuilder::buildDomainsWithBranchConstraints( // Check if the maximal number of domain disjunctions was reached. // In case this happens we will clean up and bail. 
- if (SuccDomain.n_basic_set() < MaxDisjunctsInDomain) + if (SuccDomain.n_basic_set().release() < MaxDisjunctsInDomain) continue; scop->invalidate(COMPLEXITY, DebugLoc()); @@ -1063,7 +1063,7 @@ bool ScopBuilder::propagateInvalidStmtDomains( // Check if the maximal number of domain disjunctions was reached. // In case this happens we will bail. - if (SuccInvalidDomain.n_basic_set() < MaxDisjunctsInDomain) + if (SuccInvalidDomain.n_basic_set().release() < MaxDisjunctsInDomain) continue; InvalidDomainMap.erase(BB); @@ -1162,7 +1162,7 @@ static isl::multi_union_pw_aff mapToDimension(isl::union_set USet, int N) { auto Result = isl::union_pw_multi_aff::empty(USet.get_space()); for (isl::set S : USet.get_set_list()) { - int Dim = S.tuple_dim(); + int Dim = S.tuple_dim().release(); auto PMA = isl::pw_multi_aff::project_out_map(S.get_space(), isl::dim::set, N, Dim - N); if (N > 1) @@ -1307,10 +1307,8 @@ void ScopBuilder::buildSchedule(RegionNode *RN, LoopStackTy &LoopStack) { // It is easier to insert the marks here that do it retroactively. 
isl::id IslLoopId = createIslLoopAttr(scop->getIslCtx(), L); if (!IslLoopId.is_null()) - Schedule = Schedule.get_root() - .get_child(0) - .insert_mark(IslLoopId) - .get_schedule(); + Schedule = + Schedule.get_root().child(0).insert_mark(IslLoopId).get_schedule(); LoopData->Schedule = combineInSequence(LoopData->Schedule, Schedule); } @@ -2405,7 +2403,7 @@ void ScopBuilder::foldSizeConstantsToRight() { isl::map Transform = isl::map::universe(Array->getSpace().map_from_set()); std::vector Int; - int Dims = Elements.tuple_dim(); + int Dims = Elements.tuple_dim().release(); for (int i = 0; i < Dims; i++) { isl::set DimOnly = isl::set(Elements).project_out(isl::dim::set, 0, i); DimOnly = DimOnly.project_out(isl::dim::set, 1, Dims - i - 1); @@ -2419,7 +2417,7 @@ void ScopBuilder::foldSizeConstantsToRight() { continue; } - if (DimHull.dim(isl::dim::div) == 1) { + if (DimHull.dim(isl::dim::div).release() == 1) { isl::aff Diff = DimHull.get_div(0); isl::val Val = Diff.get_denominator_val(); @@ -2839,8 +2837,8 @@ static bool isAccessRangeTooComplex(isl::set AccessRange) { int NumTotalDims = 0; for (isl::basic_set BSet : AccessRange.get_basic_set_list()) { - NumTotalDims += BSet.dim(isl::dim::div); - NumTotalDims += BSet.dim(isl::dim::set); + NumTotalDims += BSet.dim(isl::dim::div).release(); + NumTotalDims += BSet.dim(isl::dim::set).release(); } if (NumTotalDims > MaxDimensionsInAccessRange) @@ -2869,7 +2867,8 @@ void ScopBuilder::addUserContext() { isl::set UserContext = isl::set(scop->getIslCtx(), UserContextStr.c_str()); isl::space Space = scop->getParamSpace(); - if (Space.dim(isl::dim::param) != UserContext.dim(isl::dim::param)) { + if (Space.dim(isl::dim::param).release() != + UserContext.dim(isl::dim::param).release()) { std::string SpaceStr = stringFromIslObj(Space, "null"); errs() << "Error: the context provided in -polly-context has not the same " << "number of dimensions than the computed context. 
Due to this " @@ -2878,7 +2877,7 @@ void ScopBuilder::addUserContext() { return; } - for (auto i : seq(0, Space.dim(isl::dim::param))) { + for (auto i : seq(0, Space.dim(isl::dim::param).release())) { std::string NameContext = scop->getContext().get_dim_name(isl::dim::param, i); std::string NameUserContext = UserContext.get_dim_name(isl::dim::param, i); @@ -2962,7 +2961,7 @@ isl::set ScopBuilder::getNonHoistableCtx(MemoryAccess *Access, return WrittenCtx; WrittenCtx = WrittenCtx.remove_divs(); - bool TooComplex = WrittenCtx.n_basic_set() >= MaxDisjunctsInDomain; + bool TooComplex = WrittenCtx.n_basic_set().release() >= MaxDisjunctsInDomain; if (TooComplex || !isRequiredInvariantLoad(LI)) return {}; @@ -3028,7 +3027,7 @@ void ScopBuilder::addInvariantLoads(ScopStmt &Stmt, isl::set DomainCtx = Stmt.getDomain().params(); DomainCtx = DomainCtx.subtract(StmtInvalidCtx); - if (DomainCtx.n_basic_set() >= MaxDisjunctsInDomain) { + if (DomainCtx.n_basic_set().release() >= MaxDisjunctsInDomain) { auto *AccInst = InvMAs.front().MA->getAccessInstruction(); scop->invalidate(COMPLEXITY, AccInst->getDebugLoc(), AccInst->getParent()); return; @@ -3304,7 +3303,7 @@ static bool buildMinMaxAccess(isl::set Set, Set = Set.remove_divs(); polly::simplify(Set); - if (Set.n_basic_set() > RunTimeChecksMaxAccessDisjuncts) + if (Set.n_basic_set().release() > RunTimeChecksMaxAccessDisjuncts) Set = Set.simple_hull(); // Restrict the number of parameters involved in the access as the lexmin/ @@ -3342,11 +3341,11 @@ static bool buildMinMaxAccess(isl::set Set, // enclose the accessed memory region by MinPMA and MaxPMA. The pointer // we test during code generation might now point after the end of the // allocated array but we will never dereference it anyway. 
- assert((MaxPMA.is_null() || MaxPMA.dim(isl::dim::out)) && + assert((MaxPMA.is_null() || MaxPMA.dim(isl::dim::out).release()) && "Assumed at least one output dimension"); - Pos = MaxPMA.dim(isl::dim::out) - 1; - LastDimAff = MaxPMA.get_pw_aff(Pos); + Pos = MaxPMA.dim(isl::dim::out).release() - 1; + LastDimAff = MaxPMA.at(Pos); OneAff = isl::aff(isl::local_space(LastDimAff.get_domain_space())); OneAff = OneAff.add_constant_si(1); LastDimAff = LastDimAff.add(OneAff); @@ -3386,7 +3385,7 @@ bool ScopBuilder::calculateMinMaxAccess(AliasGroupTy AliasGroup, static isl::set getAccessDomain(MemoryAccess *MA) { isl::set Domain = MA->getStatement()->getDomain(); - Domain = Domain.project_out(isl::dim::set, 0, Domain.tuple_dim()); + Domain = Domain.project_out(isl::dim::set, 0, Domain.tuple_dim().release()); return Domain.reset_tuple_id(); } diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 8d11b503018f4..965776b8b3457 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -185,7 +185,7 @@ static isl::set addRangeBoundsToSet(isl::set S, const ConstantRange &Range, if (Range.isFullSet()) return S; - if (S.n_basic_set() > MaxDisjunctsInContext) + if (S.n_basic_set().release() > MaxDisjunctsInContext) return S; // In case of signed wrapping, we can refine the set of valid values by @@ -473,8 +473,8 @@ void MemoryAccess::updateDimensionality() { isl::space AccessSpace = AccessRelation.get_space().range(); isl::ctx Ctx = ArraySpace.ctx(); - auto DimsArray = ArraySpace.dim(isl::dim::set); - auto DimsAccess = AccessSpace.dim(isl::dim::set); + auto DimsArray = ArraySpace.dim(isl::dim::set).release(); + auto DimsAccess = AccessSpace.dim(isl::dim::set).release(); auto DimsMissing = DimsArray - DimsAccess; auto *BB = getStatement()->getEntryBlock(); @@ -671,14 +671,14 @@ isl::set MemoryAccess::assumeNoOutOfBound() { auto *SAI = getScopArrayInfo(); isl::space Space = getOriginalAccessRelationSpace().range(); isl::set 
Outside = isl::set::empty(Space); - for (int i = 1, Size = Space.dim(isl::dim::set); i < Size; ++i) { + for (int i = 1, Size = Space.dim(isl::dim::set).release(); i < Size; ++i) { isl::local_space LS(Space); isl::pw_aff Var = isl::pw_aff::var_on_domain(LS, isl::dim::set, i); isl::pw_aff Zero = isl::pw_aff(LS); isl::set DimOutside = Var.lt_set(Zero); isl::pw_aff SizeE = SAI->getDimensionSizePw(i); - SizeE = SizeE.add_dims(isl::dim::in, Space.dim(isl::dim::set)); + SizeE = SizeE.add_dims(isl::dim::in, Space.dim(isl::dim::set).release()); SizeE = SizeE.set_tuple_id(isl::dim::in, Space.get_tuple_id(isl::dim::set)); DimOutside = DimOutside.unite(SizeE.le_set(Var)); @@ -830,8 +830,8 @@ void MemoryAccess::foldAccessRelation() { // Access dimension folding might in certain cases increase the number of // disjuncts in the memory access, which can possibly complicate the generated // run-time checks and can lead to costly compilation. - if (!PollyPreciseFoldAccesses && - NewAccessRelation.n_basic_map() > AccessRelation.n_basic_map()) { + if (!PollyPreciseFoldAccesses && NewAccessRelation.n_basic_map().release() > + AccessRelation.n_basic_map().release()) { } else { AccessRelation = NewAccessRelation; } @@ -1006,7 +1006,7 @@ isl::pw_aff MemoryAccess::getPwAff(const SCEV *E) { static isl::map getEqualAndLarger(isl::space SetDomain) { isl::space Space = SetDomain.map_from_set(); isl::map Map = isl::map::universe(Space); - unsigned lastDimension = Map.domain_tuple_dim() - 1; + unsigned lastDimension = Map.domain_tuple_dim().release() - 1; // Set all but the last dimension to be equal for the input and output // @@ -1046,9 +1046,10 @@ bool MemoryAccess::isStrideX(isl::map Schedule, int StrideWidth) const { Stride = getStride(Schedule); StrideX = isl::set::universe(Stride.get_space()); - for (auto i : seq(0, StrideX.tuple_dim() - 1)) + for (auto i : seq(0, StrideX.tuple_dim().release() - 1)) StrideX = StrideX.fix_si(isl::dim::set, i, 0); - StrideX = StrideX.fix_si(isl::dim::set, 
StrideX.tuple_dim() - 1, StrideWidth); + StrideX = StrideX.fix_si(isl::dim::set, StrideX.tuple_dim().release() - 1, + StrideWidth); IsStrideX = Stride.is_subset(StrideX); return IsStrideX; @@ -1108,7 +1109,7 @@ void MemoryAccess::setNewAccessRelation(isl::map NewAccess) { // Check whether access dimensions correspond to number of dimensions of the // accesses array. isl_size Dims = SAI->getNumberOfDimensions(); - assert(NewAccessSpace.dim(isl::dim::set) == Dims && + assert(NewAccessSpace.dim(isl::dim::set).release() == Dims && "Access dims must match array dims"); #endif @@ -2143,10 +2144,10 @@ void Scop::intersectDefinedBehavior(isl::set Set, AssumptionSign Sign) { // Limit the complexity of the context. If complexity is exceeded, simplify // the set and check again. - if (DefinedBehaviorContext.n_basic_set() > + if (DefinedBehaviorContext.n_basic_set().release() > MaxDisjunktsInDefinedBehaviourContext) { simplify(DefinedBehaviorContext); - if (DefinedBehaviorContext.n_basic_set() > + if (DefinedBehaviorContext.n_basic_set().release() > MaxDisjunktsInDefinedBehaviourContext) DefinedBehaviorContext = {}; } diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp index f8cc47ca5a054..9297a51ef23c2 100644 --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -688,13 +688,12 @@ void BlockGenerator::generateBeginStmtTrace(ScopStmt &Stmt, LoopToScevMapT <S, Values.push_back(RuntimeDebugBuilder::getPrintableString(Builder, "(")); // Add the coordinate of the statement instance. 
- int DomDims = ScheduleMultiPwAff.dim(isl::dim::out); + int DomDims = ScheduleMultiPwAff.dim(isl::dim::out).release(); for (int i = 0; i < DomDims; i += 1) { if (i > 0) Values.push_back(RuntimeDebugBuilder::getPrintableString(Builder, ",")); - isl::ast_expr IsInSet = - RestrictedBuild.expr_from(ScheduleMultiPwAff.get_pw_aff(i)); + isl::ast_expr IsInSet = RestrictedBuild.expr_from(ScheduleMultiPwAff.at(i)); Values.push_back(ExprBuilder->create(IsInSet.copy())); } diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp index cc9112b754149..ab0bcdaf8ef5f 100644 --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -678,8 +678,8 @@ static __isl_give isl_printer *cbPrintUser(__isl_take isl_printer *P, __isl_take isl_ast_print_options *O, __isl_keep isl_ast_node *Node, void *User) { - isl::ast_node AstNode = isl::manage_copy(Node); - isl::ast_expr NodeExpr = AstNode.user_get_expr(); + isl::ast_node_user AstNode = isl::manage_copy(Node).as(); + isl::ast_expr NodeExpr = AstNode.expr(); isl::ast_expr CallExpr = NodeExpr.get_op_arg(0); isl::id CallExprId = CallExpr.get_id(); ScopStmt *AccessStmt = (ScopStmt *)CallExprId.get_user(); diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index 16a89063499bd..5119b53eafacd 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -107,10 +107,10 @@ static cl::opt PollyOmpBackend( clEnumValN(OpenMPBackend::LLVM, "LLVM", "LLVM OpenMP")), cl::Hidden, cl::init(OpenMPBackend::GNU), cl::cat(PollyCategory)); -isl::ast_expr IslNodeBuilder::getUpperBound(isl::ast_node For, +isl::ast_expr IslNodeBuilder::getUpperBound(isl::ast_node_for For, ICmpInst::Predicate &Predicate) { - isl::ast_expr Cond = For.for_get_cond(); - isl::ast_expr Iterator = For.for_get_iterator(); + isl::ast_expr Cond = For.cond(); + isl::ast_expr Iterator = For.iterator(); assert(isl_ast_expr_get_type(Cond.get()) == isl_ast_expr_op && "conditional expression is 
not an atomic upper bound"); @@ -163,16 +163,17 @@ static bool checkIslAstExprInt(__isl_take isl_ast_expr *Expr, return true; } -int IslNodeBuilder::getNumberOfIterations(isl::ast_node For) { +int IslNodeBuilder::getNumberOfIterations(isl::ast_node_for For) { assert(isl_ast_node_get_type(For.get()) == isl_ast_node_for); - isl::ast_node Body = For.for_get_body(); + isl::ast_node Body = For.body(); // First, check if we can actually handle this code. switch (isl_ast_node_get_type(Body.get())) { case isl_ast_node_user: break; case isl_ast_node_block: { - isl::ast_node_list List = Body.block_get_children(); + isl::ast_node_block BodyBlock = Body.as(); + isl::ast_node_list List = BodyBlock.children(); for (isl::ast_node Node : List) { isl_ast_node_type NodeType = isl_ast_node_get_type(Node.get()); if (NodeType != isl_ast_node_user) @@ -184,10 +185,10 @@ int IslNodeBuilder::getNumberOfIterations(isl::ast_node For) { return -1; } - isl::ast_expr Init = For.for_get_init(); + isl::ast_expr Init = For.init(); if (!checkIslAstExprInt(Init.release(), isl_val_is_zero)) return -1; - isl::ast_expr Inc = For.for_get_inc(); + isl::ast_expr Inc = For.inc(); if (!checkIslAstExprInt(Inc.release(), isl_val_is_one)) return -1; CmpInst::Predicate Predicate; @@ -413,11 +414,12 @@ void IslNodeBuilder::createMark(__isl_take isl_ast_node *Node) { if (strcmp(isl_id_get_name(Id), "SIMD") == 0 && isl_ast_node_get_type(Child) == isl_ast_node_for) { bool Vector = PollyVectorizerChoice == VECTORIZER_POLLY; - int VectorWidth = getNumberOfIterations(isl::manage_copy(Child)); + int VectorWidth = + getNumberOfIterations(isl::manage_copy(Child).as()); if (Vector && 1 < VectorWidth && VectorWidth <= 16) createForVector(Child, VectorWidth); else - createForSequential(isl::manage(Child), true); + createForSequential(isl::manage(Child).as(), true); isl_id_free(Id); return; } @@ -518,18 +520,21 @@ void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For, /// /// @param Node The band node to be 
modified. /// @return The modified schedule node. -static bool IsLoopVectorizerDisabled(isl::ast_node Node) { +static bool IsLoopVectorizerDisabled(isl::ast_node_for Node) { assert(isl_ast_node_get_type(Node.get()) == isl_ast_node_for); - auto Body = Node.for_get_body(); + isl::ast_node Body = Node.body(); if (isl_ast_node_get_type(Body.get()) != isl_ast_node_mark) return false; - auto Id = Body.mark_get_id(); + + isl::ast_node_mark BodyMark = Body.as(); + auto Id = BodyMark.id(); if (strcmp(Id.get_name().c_str(), "Loop Vectorizer Disabled") == 0) return true; return false; } -void IslNodeBuilder::createForSequential(isl::ast_node For, bool MarkParallel) { +void IslNodeBuilder::createForSequential(isl::ast_node_for For, + bool MarkParallel) { Value *ValueLB, *ValueUB, *ValueInc; Type *MaxType; BasicBlock *ExitBlock; @@ -538,7 +543,7 @@ void IslNodeBuilder::createForSequential(isl::ast_node For, bool MarkParallel) { bool LoopVectorizerDisabled = IsLoopVectorizerDisabled(For); - isl::ast_node Body = For.for_get_body(); + isl::ast_node Body = For.body(); // isl_ast_node_for_is_degenerate(For) // @@ -546,9 +551,9 @@ void IslNodeBuilder::createForSequential(isl::ast_node For, bool MarkParallel) { // However, for now we just reuse the logic for normal loops, which will // create a loop with a single iteration. 
- isl::ast_expr Init = For.for_get_init(); - isl::ast_expr Inc = For.for_get_inc(); - isl::ast_expr Iterator = For.for_get_iterator(); + isl::ast_expr Init = For.init(); + isl::ast_expr Inc = For.inc(); + isl::ast_expr Iterator = For.iterator(); isl::id IteratorID = Iterator.get_id(); isl::ast_expr UB = getUpperBound(For, Predicate); @@ -654,7 +659,8 @@ void IslNodeBuilder::createForParallel(__isl_take isl_ast_node *For) { Inc = isl_ast_node_for_get_inc(For); Iterator = isl_ast_node_for_get_iterator(For); IteratorID = isl_ast_expr_get_id(Iterator); - UB = getUpperBound(isl::manage_copy(For), Predicate).release(); + UB = getUpperBound(isl::manage_copy(For).as(), Predicate) + .release(); ValueLB = ExprBuilder.create(Init); ValueUB = ExprBuilder.create(UB); @@ -782,7 +788,8 @@ void IslNodeBuilder::createFor(__isl_take isl_ast_node *For) { if (Vector && IslAstInfo::isInnermostParallel(isl::manage_copy(For)) && !IslAstInfo::isReductionParallel(isl::manage_copy(For))) { - int VectorWidth = getNumberOfIterations(isl::manage_copy(For)); + int VectorWidth = + getNumberOfIterations(isl::manage_copy(For).as()); if (1 < VectorWidth && VectorWidth <= 16 && !hasPartialAccesses(For)) { createForVector(For, VectorWidth); return; @@ -795,7 +802,7 @@ void IslNodeBuilder::createFor(__isl_take isl_ast_node *For) { } bool Parallel = (IslAstInfo::isParallel(isl::manage_copy(For)) && !IslAstInfo::isReductionParallel(isl::manage_copy(For))); - createForSequential(isl::manage(For), Parallel); + createForSequential(isl::manage(For).as(), Parallel); } void IslNodeBuilder::createIf(__isl_take isl_ast_node *If) { diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp index aaa365cffa94d..1c4e24998f38c 100644 --- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp +++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp @@ -1124,11 +1124,11 @@ Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { if (!gpu_array_is_scalar(Array)) { isl::multi_pw_aff ArrayBound = 
isl::manage_copy(Array->bound); - isl::pw_aff OffsetDimZero = ArrayBound.get_pw_aff(0); + isl::pw_aff OffsetDimZero = ArrayBound.at(0); isl::ast_expr Res = Build.expr_from(OffsetDimZero); for (unsigned int i = 1; i < Array->n_index; i++) { - isl::pw_aff Bound_I = ArrayBound.get_pw_aff(i); + isl::pw_aff Bound_I = ArrayBound.at(i); isl::ast_expr Expr = Build.expr_from(Bound_I); Res = Res.mul(Expr); } @@ -1151,7 +1151,7 @@ Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { isl::set ZeroSet = isl::set::universe(Min.get_space()); - for (long i = 0, n = Min.tuple_dim(); i < n; i++) + for (long i = 0, n = Min.tuple_dim().release(); i < n; i++) ZeroSet = ZeroSet.fix_si(isl::dim::set, i, 0); if (Min.is_subset(ZeroSet)) { @@ -1160,7 +1160,7 @@ Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) { isl::ast_expr Result = isl::ast_expr::from_val(isl::val(Min.ctx(), 0)); - for (long i = 0, n = Min.tuple_dim(); i < n; i++) { + for (long i = 0, n = Min.tuple_dim().release(); i < n; i++) { if (i > 0) { isl::pw_aff Bound_I = isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1)); @@ -1307,7 +1307,7 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { } void GPUNodeBuilder::createFor(__isl_take isl_ast_node *Node) { - createForSequential(isl::manage(Node), false); + createForSequential(isl::manage(Node).as(), false); } void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { @@ -1596,7 +1596,7 @@ std::tuple GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { isl::multi_pw_aff GridSizePwAffs = isl::manage_copy(Kernel->grid_size); for (long i = 0; i < Kernel->n_grid; i++) { - isl::pw_aff Size = GridSizePwAffs.get_pw_aff(i); + isl::pw_aff Size = GridSizePwAffs.at(i); isl::ast_expr GridSize = Context.expr_from(Size); Value *Res = ExprBuilder.create(GridSize.release()); Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); @@ -2885,8 +2885,8 @@ class PPCGCodeGeneration : public ScopPass { isl::pw_aff Val = 
isl::aff::var_on_domain(LS, isl::dim::set, 0); isl::pw_aff OuterMin = AccessSet.dim_min(0); isl::pw_aff OuterMax = AccessSet.dim_max(0); - OuterMin = OuterMin.add_dims(isl::dim::in, Val.dim(isl::dim::in)); - OuterMax = OuterMax.add_dims(isl::dim::in, Val.dim(isl::dim::in)); + OuterMin = OuterMin.add_dims(isl::dim::in, Val.dim(isl::dim::in).release()); + OuterMax = OuterMax.add_dims(isl::dim::in, Val.dim(isl::dim::in).release()); OuterMin = OuterMin.set_tuple_id(isl::dim::in, Array->getBasePtrId()); OuterMax = OuterMax.set_tuple_id(isl::dim::in, Array->getBasePtrId()); @@ -2910,7 +2910,7 @@ class PPCGCodeGeneration : public ScopPass { isl::pw_aff Val = isl::aff::var_on_domain( isl::local_space(Array->getSpace()), isl::dim::set, i); - PwAff = PwAff.add_dims(isl::dim::in, Val.dim(isl::dim::in)); + PwAff = PwAff.add_dims(isl::dim::in, Val.dim(isl::dim::in).release()); PwAff = PwAff.set_tuple_id(isl::dim::in, Val.get_tuple_id(isl::dim::in)); isl::set Set = PwAff.gt_set(Val); Extent = Set.intersect(Extent); diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index 5a4e01a8b4b74..4bff2e033bc72 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -230,8 +230,8 @@ static bool importContext(Scop &S, const json::Object &JScop) { return false; } - unsigned OldContextDim = OldContext.dim(isl::dim::param); - unsigned NewContextDim = NewContext.dim(isl::dim::param); + unsigned OldContextDim = OldContext.dim(isl::dim::param).release(); + unsigned NewContextDim = NewContext.dim(isl::dim::param).release(); // Check if the imported context has the right number of parameters. 
if (OldContextDim != NewContextDim) { diff --git a/polly/lib/External/isl/include/isl/isl-noexceptions.h b/polly/lib/External/isl/include/isl/isl-noexceptions.h index 0aae386f0620a..5142941900ce8 100644 --- a/polly/lib/External/isl/include/isl/isl-noexceptions.h +++ b/polly/lib/External/isl/include/isl/isl-noexceptions.h @@ -10,34 +10,17 @@ #ifndef ISL_CPP_CHECKED #define ISL_CPP_CHECKED -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include #include +#include +#include #include +#include namespace isl { -inline namespace noexceptions { #define ISLPP_STRINGIZE_(X) #X #define ISLPP_STRINGIZE(X) ISLPP_STRINGIZE_(X) @@ -52,37 +35,60 @@ inline namespace noexceptions { abort(); \ } while (0) +/* Class used to check that isl::checked::boolean, + * isl::checked::stat and isl::checked::size values are checked for errors. + */ +struct checker { + bool checked = false; + ~checker() { + //ISLPP_ASSERT(checked, "IMPLEMENTATION ERROR: Unchecked state"); + } +}; + class boolean { private: - mutable bool checked = false; + mutable std::shared_ptr check = std::make_shared(); isl_bool val; friend boolean manage(isl_bool val); boolean(isl_bool val): val(val) {} public: + static boolean error() { + return boolean(isl_bool_error); + } boolean() : val(isl_bool_error) {} - ~boolean() { - // ISLPP_ASSERT(checked, "IMPLEMENTATION ERROR: Unchecked state"); - } /* implicit */ boolean(bool val) : val(val ? 
isl_bool_true : isl_bool_false) {} - bool is_error() const { checked = true; return val == isl_bool_error; } - bool is_false() const { checked = true; return val == isl_bool_false; } - bool is_true() const { checked = true; return val == isl_bool_true; } + isl_bool release() { + auto tmp = val; + val = isl_bool_error; + check->checked = true; + return tmp; + } + + bool is_error() const { check->checked = true; return val == isl_bool_error; } + bool is_false() const { check->checked = true; return val == isl_bool_false; } + bool is_true() const { check->checked = true; return val == isl_bool_true; } operator bool() const { - // ISLPP_ASSERT(checked, "IMPLEMENTATION ERROR: Unchecked error state"); + //ISLPP_ASSERT(check->checked, "IMPLEMENTATION ERROR: Unchecked error state"); ISLPP_ASSERT(!is_error(), "IMPLEMENTATION ERROR: Unhandled error state"); return is_true(); } + boolean negate() { + if (val == isl_bool_true) + val = isl_bool_false; + else if (val == isl_bool_false) + val = isl_bool_true; + return *this; + } + boolean operator!() const { - if (is_error()) - return *this; - return !is_true(); + return boolean(*this).negate(); } }; @@ -109,12 +115,12 @@ class ctx { */ class stat { private: - mutable bool checked = false; + mutable std::shared_ptr check = std::make_shared(); isl_stat val; friend stat manage(isl_stat val); + stat(isl_stat val) : val(val) {} public: - constexpr stat(isl_stat val) : val(val) {} static stat ok() { return stat(isl_stat_ok); } @@ -122,31 +128,65 @@ class stat { return stat(isl_stat_error); } stat() : val(isl_stat_error) {} - ~stat() { - // ISLPP_ASSERT(checked, "IMPLEMENTATION ERROR: Unchecked state"); - } isl_stat release() { - checked = true; + check->checked = true; return val; } bool is_error() const { - checked = true; + check->checked = true; return val == isl_stat_error; } bool is_ok() const { - checked = true; + check->checked = true; return val == isl_stat_ok; } }; - inline stat manage(isl_stat val) { return stat(val); } +/* 
Class encapsulating an isl_size value. + */ +class size { +private: + mutable std::shared_ptr check = std::make_shared(); + isl_size val; + + friend size manage(isl_size val); + size(isl_size val) : val(val) {} +public: + size() : val(isl_size_error) {} + + isl_size release() { + auto tmp = val; + val = isl_size_error; + check->checked = true; + return tmp; + } + + bool is_error() const { + check->checked = true; + return val == isl_size_error; + } + + explicit operator unsigned() const { + ISLPP_ASSERT(check->checked, + "IMPLEMENTATION ERROR: Unchecked error state"); + ISLPP_ASSERT(!is_error(), + "IMPLEMENTATION ERROR: Unhandled error state"); + return val; + } +}; + +inline size manage(isl_size val) +{ + return size(val); +} + enum class dim { cst = isl_dim_cst, param = isl_dim_param, @@ -157,27 +197,72 @@ enum class dim { all = isl_dim_all }; -} } // namespace isl -namespace isl { +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -inline namespace noexceptions { +namespace isl { // forward declarations class aff; class aff_list; class ast_build; class ast_expr; -class ast_expr_list; +class ast_expr_id; +class ast_expr_int; +class ast_expr_op; +class ast_expr_op_access; +class ast_expr_op_add; +class ast_expr_op_address_of; +class ast_expr_op_and; +class ast_expr_op_and_then; +class ast_expr_op_call; +class ast_expr_op_cond; +class ast_expr_op_div; +class ast_expr_op_eq; +class ast_expr_op_fdiv_q; +class ast_expr_op_ge; +class ast_expr_op_gt; +class ast_expr_op_le; +class ast_expr_op_lt; +class ast_expr_op_max; +class ast_expr_op_member; +class ast_expr_op_min; +class ast_expr_op_minus; +class ast_expr_op_mul; +class ast_expr_op_or; +class ast_expr_op_or_else; +class ast_expr_op_pdiv_q; +class ast_expr_op_pdiv_r; +class ast_expr_op_select; +class ast_expr_op_sub; +class ast_expr_op_zdiv_r; class ast_node; +class ast_node_block; +class ast_node_for; +class 
ast_node_if; class ast_node_list; +class ast_node_mark; +class ast_node_user; class basic_map; class basic_map_list; class basic_set; class basic_set_list; class constraint; -class constraint_list; class fixed_box; class id; class id_list; @@ -185,7 +270,6 @@ class id_to_ast_expr; class local_space; class map; class map_list; -class mat; class multi_aff; class multi_id; class multi_pw_aff; @@ -196,32 +280,33 @@ class pw_aff; class pw_aff_list; class pw_multi_aff; class pw_multi_aff_list; -class pw_qpolynomial; -class pw_qpolynomial_fold_list; -class pw_qpolynomial_list; -class qpolynomial; -class qpolynomial_list; class schedule; class schedule_constraints; class schedule_node; +class schedule_node_band; +class schedule_node_context; +class schedule_node_domain; +class schedule_node_expansion; +class schedule_node_extension; +class schedule_node_filter; +class schedule_node_guard; +class schedule_node_leaf; +class schedule_node_mark; +class schedule_node_sequence; +class schedule_node_set; class set; class set_list; class space; -class term; class union_access_info; class union_flow; class union_map; -class union_map_list; class union_pw_aff; class union_pw_aff_list; class union_pw_multi_aff; -class union_pw_multi_aff_list; -class union_pw_qpolynomial; class union_set; class union_set_list; class val; class val_list; -class vec; // declarations for isl::aff inline aff manage(__isl_take isl_aff *ptr); @@ -231,6 +316,7 @@ class aff { friend inline aff manage(__isl_take isl_aff *ptr); friend inline aff manage_copy(__isl_keep isl_aff *ptr); +protected: isl_aff *ptr = nullptr; inline explicit aff(__isl_take isl_aff *ptr); @@ -249,81 +335,182 @@ class aff { inline __isl_give isl_aff *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; inline isl::aff add(isl::aff aff2) const; - inline isl::aff add_coefficient_si(isl::dim type, int pos, int v) const; - inline isl::aff add_coefficient_val(isl::dim type, int pos, isl::val v) 
const; + inline isl::multi_aff add(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff add(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff add(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_aff add(const isl::pw_aff &pwaff2) const; + inline isl::pw_multi_aff add(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_aff add(const isl::union_pw_aff &upa2) const; + inline isl::union_pw_multi_aff add(const isl::union_pw_multi_aff &upma2) const; inline isl::aff add_constant(isl::val v) const; - inline isl::aff add_constant_num_si(int v) const; + inline isl::aff add_constant(long v) const; + inline isl::multi_aff add_constant(const isl::multi_val &mv) const; inline isl::aff add_constant_si(int v) const; - inline isl::aff add_dims(isl::dim type, unsigned int n) const; - inline isl::aff align_params(isl::space model) const; + inline isl::pw_aff add_dims(isl::dim type, unsigned int n) const; + inline isl::union_pw_multi_aff add_pw_multi_aff(const isl::pw_multi_aff &pma) const; + inline isl::union_pw_multi_aff apply(const isl::union_pw_multi_aff &upma2) const; + inline isl::aff as_aff() const; + inline isl::map as_map() const; + inline isl::multi_aff as_multi_aff() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::union_map as_union_map() const; + inline isl::aff at(int pos) const; inline isl::basic_set bind(isl::id id) const; + inline isl::basic_set bind(const std::string &id) const; + inline isl::basic_set bind(const isl::multi_id &tuple) const; + inline isl::pw_aff bind_domain(const isl::multi_id &tuple) const; + inline isl::pw_aff bind_domain_wrapped_domain(const isl::multi_id &tuple) const; inline isl::aff ceil() const; - inline int coefficient_sgn(isl::dim type, int pos) const; - inline isl_size dim(isl::dim type) const; + inline isl::pw_aff coalesce() const; + inline isl::pw_aff cond(const 
isl::pw_aff &pwaff_true, const isl::pw_aff &pwaff_false) const; + inline isl::multi_val constant_multi_val() const; + inline isl::val constant_val() const; + inline isl::val get_constant_val() const; + inline isl::val denominator_val() const; + inline isl::val get_denominator_val() const; + inline class size dim(isl::dim type) const; + inline isl::id dim_id(isl::dim type, unsigned int pos) const; inline isl::aff div(isl::aff aff2) const; - inline isl::aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_set eq_basic_set(isl::aff aff2) const; + inline isl::pw_aff div(const isl::pw_aff &pa2) const; + inline isl::set domain() const; + inline isl::space domain_space() const; + inline isl::pw_multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; inline isl::set eq_set(isl::aff aff2) const; + inline isl::set eq_set(const isl::pw_aff &pwaff2) const; inline isl::val eval(isl::point pnt) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; + inline isl::pw_multi_aff extract_pw_multi_aff(const isl::space &space) const; + inline isl::multi_aff flat_range_product(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff flat_range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff flat_range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff flat_range_product(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff flat_range_product(const isl::union_pw_multi_aff &upma2) const; inline isl::aff floor() const; - inline isl::aff from_range() const; - inline isl::basic_set ge_basic_set(isl::aff aff2) const; + inline stat foreach_piece(const std::function &fn) const; + inline stat foreach_piece(const std::function &fn) const; + inline stat foreach_pw_aff(const std::function &fn) const; inline isl::set ge_set(isl::aff aff2) const; - inline isl::val get_coefficient_val(isl::dim type, int pos) const; - inline 
isl::val get_constant_val() const; - inline isl::val get_denominator_val() const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::aff get_div(int pos) const; - inline isl::local_space get_domain_local_space() const; - inline isl::space get_domain_space() const; - inline uint32_t get_hash() const; - inline isl::local_space get_local_space() const; - inline isl::space get_space() const; + inline isl::set ge_set(const isl::pw_aff &pwaff2) const; inline isl::aff gist(isl::set context) const; - inline isl::aff gist_params(isl::set context) const; - inline isl::basic_set gt_basic_set(isl::aff aff2) const; + inline isl::union_pw_aff gist(const isl::union_set &context) const; + inline isl::aff gist(const isl::basic_set &context) const; + inline isl::aff gist(const isl::point &context) const; inline isl::set gt_set(isl::aff aff2) const; - inline isl::aff insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set gt_set(const isl::pw_aff &pwaff2) const; + inline boolean has_range_tuple_id() const; + inline isl::multi_aff identity() const; + inline isl::pw_aff insert_domain(const isl::space &domain) const; + inline isl::pw_aff intersect_domain(const isl::set &set) const; + inline isl::union_pw_aff intersect_domain(const isl::space &space) const; + inline isl::union_pw_aff intersect_domain(const isl::union_set &uset) const; + inline isl::union_pw_aff intersect_domain_wrapped_domain(const isl::union_set &uset) const; + inline isl::union_pw_aff intersect_domain_wrapped_range(const isl::union_set &uset) const; + inline isl::pw_aff intersect_params(const isl::set &set) const; inline boolean involves_locals() const; + inline boolean involves_nan() const; + inline boolean involves_param(const isl::id &id) const; + inline boolean involves_param(const std::string &id) const; + inline boolean involves_param(const isl::id_list 
&list) const; inline boolean is_cst() const; - inline boolean is_nan() const; - inline isl::basic_set le_basic_set(isl::aff aff2) const; + inline boolean is_equal(const isl::pw_aff &pa2) const; + inline boolean isa_aff() const; + inline boolean isa_multi_aff() const; + inline boolean isa_pw_multi_aff() const; inline isl::set le_set(isl::aff aff2) const; - inline isl::basic_set lt_basic_set(isl::aff aff2) const; + inline isl::set le_set(const isl::pw_aff &pwaff2) const; + inline isl::aff_list list() const; inline isl::set lt_set(isl::aff aff2) const; + inline isl::set lt_set(const isl::pw_aff &pwaff2) const; + inline isl::multi_pw_aff max(const isl::multi_pw_aff &multi2) const; + inline isl::pw_aff max(const isl::pw_aff &pwaff2) const; + inline isl::multi_val max_multi_val() const; + inline isl::multi_pw_aff min(const isl::multi_pw_aff &multi2) const; + inline isl::pw_aff min(const isl::pw_aff &pwaff2) const; + inline isl::multi_val min_multi_val() const; inline isl::aff mod(isl::val mod) const; - inline isl::aff move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; + inline isl::aff mod(long mod) const; inline isl::aff mul(isl::aff aff2) const; - static inline isl::aff nan_on_domain(isl::local_space ls); - static inline isl::aff nan_on_domain_space(isl::space space); + inline isl::pw_aff mul(const isl::pw_aff &pwaff2) const; + inline class size n_piece() const; inline isl::set ne_set(isl::aff aff2) const; + inline isl::set ne_set(const isl::pw_aff &pwaff2) const; inline isl::aff neg() const; - inline isl::basic_set neg_basic_set() const; - static inline isl::aff param_on_domain_space_id(isl::space space, isl::id id); - inline boolean plain_is_equal(const isl::aff &aff2) const; - inline boolean plain_is_zero() const; - inline isl::aff project_domain_on_params() const; + inline boolean plain_is_empty() const; + inline boolean plain_is_equal(const isl::multi_aff &multi2) const; + inline boolean 
plain_is_equal(const isl::multi_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff preimage_domain_wrapped_domain(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff product(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff product(const isl::multi_pw_aff &multi2) const; + inline isl::pw_multi_aff product(const isl::pw_multi_aff &pma2) const; inline isl::aff pullback(isl::multi_aff ma) const; - inline isl::aff pullback_aff(isl::aff aff2) const; + inline isl::pw_aff pullback(const isl::multi_pw_aff &mpa) const; + inline isl::pw_aff pullback(const isl::pw_multi_aff &pma) const; + inline isl::union_pw_aff pullback(const isl::union_pw_multi_aff &upma) const; + inline isl::aff pullback(const isl::aff &ma) const; + inline isl::pw_multi_aff_list pw_multi_aff_list() const; + inline isl::pw_multi_aff range_factor_domain() const; + inline isl::pw_multi_aff range_factor_range() const; + inline isl::multi_aff range_product(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff range_product(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::id range_tuple_id() const; + inline isl::multi_aff reset_range_tuple_id() const; + inline isl::multi_aff reset_tuple_id(isl::dim type) const; inline isl::aff scale(isl::val v) const; + inline isl::aff scale(long v) const; + inline isl::multi_aff scale(const isl::multi_val &mv) const; inline isl::aff scale_down(isl::val v) const; - inline isl::aff scale_down_ui(unsigned int f) const; - inline isl::aff set_coefficient_si(isl::dim type, int pos, int v) const; - 
inline isl::aff set_coefficient_val(isl::dim type, int pos, isl::val v) const; + inline isl::aff scale_down(long v) const; + inline isl::multi_aff scale_down(const isl::multi_val &mv) const; + inline isl::multi_aff set_aff(int pos, const isl::aff &el) const; + inline isl::multi_aff set_at(int pos, const isl::aff &el) const; + inline isl::multi_pw_aff set_at(int pos, const isl::pw_aff &el) const; + inline isl::multi_union_pw_aff set_at(int pos, const isl::union_pw_aff &el) const; inline isl::aff set_constant_si(int v) const; - inline isl::aff set_constant_val(isl::val v) const; - inline isl::aff set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::aff set_tuple_id(isl::dim type, isl::id id) const; + inline isl::multi_pw_aff set_pw_aff(int pos, const isl::pw_aff &el) const; + inline isl::pw_multi_aff set_pw_aff(unsigned int pos, const isl::pw_aff &pa) const; + inline isl::multi_aff set_range_tuple(const isl::id &id) const; + inline isl::multi_aff set_range_tuple(const std::string &id) const; + inline isl::pw_aff set_tuple_id(isl::dim type, const isl::id &id) const; + inline isl::pw_aff set_tuple_id(isl::dim type, const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, const isl::union_pw_aff &el) const; + inline class size size() const; + inline isl::space space() const; inline isl::aff sub(isl::aff aff2) const; + inline isl::multi_aff sub(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff sub(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff sub(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_aff sub(const isl::pw_aff &pwaff2) const; + inline isl::pw_multi_aff sub(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_aff sub(const isl::union_pw_aff &upa2) const; + inline isl::union_pw_multi_aff sub(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_aff subtract_domain(const isl::set &set) const; + inline isl::union_pw_aff 
subtract_domain(const isl::space &space) const; + inline isl::union_pw_aff subtract_domain(const isl::union_set &uset) const; + inline isl::pw_aff tdiv_q(const isl::pw_aff &pa2) const; + inline isl::pw_aff tdiv_r(const isl::pw_aff &pa2) const; + inline isl::aff_list to_list() const; + inline isl::multi_pw_aff to_multi_pw_aff() const; + inline isl::multi_union_pw_aff to_multi_union_pw_aff() const; + inline isl::pw_multi_aff to_pw_multi_aff() const; + inline isl::union_pw_aff to_union_pw_aff() const; + inline isl::union_pw_multi_aff to_union_pw_multi_aff() const; + inline isl::id tuple_id(isl::dim type) const; inline isl::aff unbind_params_insert_domain(isl::multi_id domain) const; - static inline isl::aff val_on_domain_space(isl::space space, isl::val val); + inline isl::multi_pw_aff union_add(const isl::multi_pw_aff &mpa2) const; + inline isl::multi_union_pw_aff union_add(const isl::multi_union_pw_aff &mupa2) const; + inline isl::pw_aff union_add(const isl::pw_aff &pwaff2) const; + inline isl::pw_multi_aff union_add(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_aff union_add(const isl::union_pw_aff &upa2) const; + inline isl::union_pw_multi_aff union_add(const isl::union_pw_multi_aff &upma2) const; static inline isl::aff var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos); - inline isl::basic_set zero_basic_set() const; static inline isl::aff zero_on_domain(isl::space space); }; @@ -335,6 +522,7 @@ class aff_list { friend inline aff_list manage(__isl_take isl_aff_list *ptr); friend inline aff_list manage_copy(__isl_keep isl_aff_list *ptr); +protected: isl_aff_list *ptr = nullptr; inline explicit aff_list(__isl_take isl_aff_list *ptr); @@ -342,6 +530,9 @@ class aff_list { public: inline /* implicit */ aff_list(); inline /* implicit */ aff_list(const aff_list &obj); + inline explicit aff_list(isl::ctx ctx, int n); + inline explicit aff_list(isl::aff el); + inline explicit aff_list(isl::ctx ctx, const std::string &str); inline aff_list 
&operator=(aff_list obj); inline ~aff_list(); inline __isl_give isl_aff_list *copy() const &; @@ -350,23 +541,16 @@ class aff_list { inline __isl_give isl_aff_list *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; inline isl::aff_list add(isl::aff el) const; - static inline isl::aff_list alloc(isl::ctx ctx, int n); + inline isl::aff at(int index) const; + inline isl::aff get_at(int index) const; inline isl::aff_list clear() const; inline isl::aff_list concat(isl::aff_list list2) const; inline isl::aff_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::aff_list from_aff(isl::aff el); - inline isl::aff get_aff(int index) const; - inline isl::aff get_at(int index) const; + inline stat foreach(const std::function &fn) const; inline isl::aff_list insert(unsigned int pos, isl::aff el) const; - inline isl_size n_aff() const; - inline isl::aff_list reverse() const; - inline isl::aff_list set_aff(int index, isl::aff el) const; - inline isl_size size() const; - inline isl::aff_list swap(unsigned int pos1, unsigned int pos2) const; + inline class size size() const; }; // declarations for isl::ast_build @@ -377,6 +561,7 @@ class ast_build { friend inline ast_build manage(__isl_take isl_ast_build *ptr); friend inline ast_build manage_copy(__isl_keep isl_ast_build *ptr); +protected: isl_ast_build *ptr = nullptr; inline explicit ast_build(__isl_take isl_ast_build *ptr); @@ -394,19 +579,28 @@ class ast_build { inline bool is_null() const; inline isl::ctx ctx() const; +private: + inline ast_build ©_callbacks(const ast_build &obj); + struct at_each_domain_data { + std::function func; + }; + std::shared_ptr at_each_domain_data; + static inline isl_ast_node *at_each_domain(isl_ast_node *arg_0, isl_ast_build *arg_1, void *arg_2); + inline void set_at_each_domain_data(const std::function &fn); +public: + inline isl::ast_build set_at_each_domain(const std::function 
&fn) const; inline isl::ast_expr access_from(isl::multi_pw_aff mpa) const; inline isl::ast_expr access_from(isl::pw_multi_aff pma) const; - inline isl::ast_node ast_from_schedule(isl::union_map schedule) const; inline isl::ast_expr call_from(isl::multi_pw_aff mpa) const; inline isl::ast_expr call_from(isl::pw_multi_aff pma) const; inline isl::ast_expr expr_from(isl::pw_aff pa) const; inline isl::ast_expr expr_from(isl::set set) const; static inline isl::ast_build from_context(isl::set set); - inline isl::union_map get_schedule() const; - inline isl::space get_schedule_space() const; inline isl::ast_node node_from(isl::schedule schedule) const; inline isl::ast_node node_from_schedule_map(isl::union_map schedule) const; inline isl::ast_build restrict(isl::set set) const; + inline isl::union_map schedule() const; + inline isl::union_map get_schedule() const; }; // declarations for isl::ast_expr @@ -417,6 +611,7 @@ class ast_expr { friend inline ast_expr manage(__isl_take isl_ast_expr *ptr); friend inline ast_expr manage_copy(__isl_keep isl_ast_expr *ptr); +protected: isl_ast_expr *ptr = nullptr; inline explicit ast_expr(__isl_take isl_ast_expr *ptr); @@ -431,5813 +626,8255 @@ class ast_expr { inline __isl_keep isl_ast_expr *get() const; inline __isl_give isl_ast_expr *release(); inline bool is_null() const; +private: + template ::value>::type> + inline boolean isa_type(T subtype) const; +public: + template inline boolean isa() const; + template inline T as() const; inline isl::ctx ctx() const; - inline void dump() const; - inline isl::ast_expr access(isl::ast_expr_list indices) const; inline isl::ast_expr add(isl::ast_expr expr2) const; inline isl::ast_expr address_of() const; - inline isl::ast_expr call(isl::ast_expr_list arguments) const; - inline isl::ast_expr div(isl::ast_expr expr2) const; inline isl::ast_expr eq(isl::ast_expr expr2) const; - static inline isl::ast_expr from_id(isl::id id); static inline isl::ast_expr from_val(isl::val v); - inline isl::ast_expr 
ge(isl::ast_expr expr2) const; + inline isl::id id() const; inline isl::id get_id() const; - inline isl::ast_expr get_op_arg(int pos) const; - inline isl_size get_op_n_arg() const; - inline isl::val get_val() const; - inline isl::ast_expr gt(isl::ast_expr expr2) const; - inline isl::id id_get_id() const; - inline isl::val int_get_val() const; - inline boolean is_equal(const isl::ast_expr &expr2) const; inline isl::ast_expr le(isl::ast_expr expr2) const; - inline isl::ast_expr lt(isl::ast_expr expr2) const; inline isl::ast_expr mul(isl::ast_expr expr2) const; - inline isl::ast_expr neg() const; - inline isl::ast_expr op_get_arg(int pos) const; - inline isl_size op_get_n_arg() const; - inline isl::ast_expr pdiv_q(isl::ast_expr expr2) const; - inline isl::ast_expr pdiv_r(isl::ast_expr expr2) const; - inline isl::ast_expr set_op_arg(int pos, isl::ast_expr arg) const; - inline isl::ast_expr sub(isl::ast_expr expr2) const; - inline isl::ast_expr substitute_ids(isl::id_to_ast_expr id2expr) const; + inline isl::ast_expr op_arg(int pos) const; + inline isl::ast_expr get_op_arg(int pos) const; inline std::string to_C_str() const; + inline isl::val val() const; + inline isl::val get_val() const; }; -// declarations for isl::ast_expr_list -inline ast_expr_list manage(__isl_take isl_ast_expr_list *ptr); -inline ast_expr_list manage_copy(__isl_keep isl_ast_expr_list *ptr); - -class ast_expr_list { - friend inline ast_expr_list manage(__isl_take isl_ast_expr_list *ptr); - friend inline ast_expr_list manage_copy(__isl_keep isl_ast_expr_list *ptr); +// declarations for isl::ast_expr_id - isl_ast_expr_list *ptr = nullptr; +class ast_expr_id : public ast_expr { + template + friend boolean ast_expr::isa() const; + friend ast_expr_id ast_expr::as() const; + static const auto type = isl_ast_expr_id; - inline explicit ast_expr_list(__isl_take isl_ast_expr_list *ptr); +protected: + inline explicit ast_expr_id(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ ast_expr_list(); 
- inline /* implicit */ ast_expr_list(const ast_expr_list &obj); - inline ast_expr_list &operator=(ast_expr_list obj); - inline ~ast_expr_list(); - inline __isl_give isl_ast_expr_list *copy() const &; - inline __isl_give isl_ast_expr_list *copy() && = delete; - inline __isl_keep isl_ast_expr_list *get() const; - inline __isl_give isl_ast_expr_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_id(); + inline /* implicit */ ast_expr_id(const ast_expr_id &obj); + inline ast_expr_id &operator=(ast_expr_id obj); inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::ast_expr_list add(isl::ast_expr el) const; - static inline isl::ast_expr_list alloc(isl::ctx ctx, int n); - inline isl::ast_expr_list clear() const; - inline isl::ast_expr_list concat(isl::ast_expr_list list2) const; - inline isl::ast_expr_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::ast_expr_list from_ast_expr(isl::ast_expr el); - inline isl::ast_expr get_ast_expr(int index) const; - inline isl::ast_expr get_at(int index) const; - inline isl::ast_expr_list insert(unsigned int pos, isl::ast_expr el) const; - inline isl_size n_ast_expr() const; - inline isl::ast_expr_list reverse() const; - inline isl::ast_expr_list set_ast_expr(int index, isl::ast_expr el) const; - inline isl_size size() const; - inline isl::ast_expr_list swap(unsigned int pos1, unsigned int pos2) const; -}; -// declarations for isl::ast_node -inline ast_node manage(__isl_take isl_ast_node *ptr); -inline ast_node manage_copy(__isl_keep isl_ast_node *ptr); + inline isl::id id() const; + inline isl::id get_id() const; +}; -class ast_node { - friend inline ast_node manage(__isl_take isl_ast_node *ptr); - friend inline ast_node manage_copy(__isl_keep isl_ast_node *ptr); +// declarations for isl::ast_expr_int - isl_ast_node *ptr = nullptr; +class ast_expr_int : public ast_expr { + template + friend boolean 
ast_expr::isa() const; + friend ast_expr_int ast_expr::as() const; + static const auto type = isl_ast_expr_int; - inline explicit ast_node(__isl_take isl_ast_node *ptr); +protected: + inline explicit ast_expr_int(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ ast_node(); - inline /* implicit */ ast_node(const ast_node &obj); - inline ast_node &operator=(ast_node obj); - inline ~ast_node(); - inline __isl_give isl_ast_node *copy() const &; - inline __isl_give isl_ast_node *copy() && = delete; - inline __isl_keep isl_ast_node *get() const; - inline __isl_give isl_ast_node *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_int(); + inline /* implicit */ ast_expr_int(const ast_expr_int &obj); + inline ast_expr_int &operator=(ast_expr_int obj); inline isl::ctx ctx() const; - inline void dump() const; - - static inline isl::ast_node alloc_user(isl::ast_expr expr); - inline isl::ast_node_list block_get_children() const; - inline isl::ast_node for_get_body() const; - inline isl::ast_expr for_get_cond() const; - inline isl::ast_expr for_get_inc() const; - inline isl::ast_expr for_get_init() const; - inline isl::ast_expr for_get_iterator() const; - inline boolean for_is_degenerate() const; - inline isl::id get_annotation() const; - inline isl::ast_expr if_get_cond() const; - inline isl::ast_node if_get_else() const; - inline isl::ast_node if_get_else_node() const; - inline isl::ast_node if_get_then() const; - inline isl::ast_node if_get_then_node() const; - inline boolean if_has_else() const; - inline boolean if_has_else_node() const; - inline isl::id mark_get_id() const; - inline isl::ast_node mark_get_node() const; - inline isl::ast_node set_annotation(isl::id annotation) const; - inline std::string to_C_str() const; - inline isl::ast_expr user_get_expr() const; -}; -// declarations for isl::ast_node_list -inline ast_node_list manage(__isl_take isl_ast_node_list *ptr); -inline ast_node_list manage_copy(__isl_keep isl_ast_node_list 
*ptr); + inline isl::val val() const; + inline isl::val get_val() const; +}; -class ast_node_list { - friend inline ast_node_list manage(__isl_take isl_ast_node_list *ptr); - friend inline ast_node_list manage_copy(__isl_keep isl_ast_node_list *ptr); +// declarations for isl::ast_expr_op - isl_ast_node_list *ptr = nullptr; +class ast_expr_op : public ast_expr { + template + friend boolean ast_expr::isa() const; + friend ast_expr_op ast_expr::as() const; + static const auto type = isl_ast_expr_op; - inline explicit ast_node_list(__isl_take isl_ast_node_list *ptr); +protected: + inline explicit ast_expr_op(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ ast_node_list(); - inline /* implicit */ ast_node_list(const ast_node_list &obj); - inline ast_node_list &operator=(ast_node_list obj); - inline ~ast_node_list(); - inline __isl_give isl_ast_node_list *copy() const &; - inline __isl_give isl_ast_node_list *copy() && = delete; - inline __isl_keep isl_ast_node_list *get() const; - inline __isl_give isl_ast_node_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op(); + inline /* implicit */ ast_expr_op(const ast_expr_op &obj); + inline ast_expr_op &operator=(ast_expr_op obj); +private: + template ::value>::type> + inline boolean isa_type(T subtype) const; +public: + template inline boolean isa() const; + template inline T as() const; inline isl::ctx ctx() const; - inline void dump() const; - inline isl::ast_node_list add(isl::ast_node el) const; - static inline isl::ast_node_list alloc(isl::ctx ctx, int n); - inline isl::ast_node_list clear() const; - inline isl::ast_node_list concat(isl::ast_node_list list2) const; - inline isl::ast_node_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::ast_node_list from_ast_node(isl::ast_node el); - inline isl::ast_node get_ast_node(int index) const; - inline isl::ast_node get_at(int index) const; - inline 
isl::ast_node_list insert(unsigned int pos, isl::ast_node el) const; - inline isl_size n_ast_node() const; - inline isl::ast_node_list reverse() const; - inline isl::ast_node_list set_ast_node(int index, isl::ast_node el) const; - inline isl_size size() const; - inline isl::ast_node_list swap(unsigned int pos1, unsigned int pos2) const; + inline isl::ast_expr arg(int pos) const; + inline isl::ast_expr get_arg(int pos) const; + inline class size n_arg() const; + inline class size get_n_arg() const; }; -// declarations for isl::basic_map -inline basic_map manage(__isl_take isl_basic_map *ptr); -inline basic_map manage_copy(__isl_keep isl_basic_map *ptr); - -class basic_map { - friend inline basic_map manage(__isl_take isl_basic_map *ptr); - friend inline basic_map manage_copy(__isl_keep isl_basic_map *ptr); +// declarations for isl::ast_expr_op_access - isl_basic_map *ptr = nullptr; +class ast_expr_op_access : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_access ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_access; - inline explicit basic_map(__isl_take isl_basic_map *ptr); +protected: + inline explicit ast_expr_op_access(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ basic_map(); - inline /* implicit */ basic_map(const basic_map &obj); - inline explicit basic_map(isl::ctx ctx, const std::string &str); - inline basic_map &operator=(basic_map obj); - inline ~basic_map(); - inline __isl_give isl_basic_map *copy() const &; - inline __isl_give isl_basic_map *copy() && = delete; - inline __isl_keep isl_basic_map *get() const; - inline __isl_give isl_basic_map *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_access(); + inline /* implicit */ ast_expr_op_access(const ast_expr_op_access &obj); + inline ast_expr_op_access &operator=(ast_expr_op_access obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::basic_map 
add_constraint(isl::constraint constraint) const; - inline isl::basic_map add_dims(isl::dim type, unsigned int n) const; - inline isl::basic_map affine_hull() const; - inline isl::basic_map align_params(isl::space model) const; - inline isl::basic_map apply_domain(isl::basic_map bmap2) const; - inline isl::basic_map apply_range(isl::basic_map bmap2) const; - inline boolean can_curry() const; - inline boolean can_uncurry() const; - inline boolean can_zip() const; - inline isl::basic_map curry() const; - inline isl::basic_set deltas() const; - inline isl::basic_map deltas_map() const; - inline isl::basic_map detect_equalities() const; - inline isl_size dim(isl::dim type) const; - inline isl::basic_set domain() const; - inline isl::basic_map domain_map() const; - inline isl::basic_map domain_product(isl::basic_map bmap2) const; - inline isl::basic_map drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_map drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_map drop_unused_params() const; - inline isl::basic_map eliminate(isl::dim type, unsigned int first, unsigned int n) const; - static inline isl::basic_map empty(isl::space space); - static inline isl::basic_map equal(isl::space space, unsigned int n_equal); - inline isl::mat equalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4, isl::dim c5) const; - inline isl::basic_map equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::basic_map fix_si(isl::dim type, unsigned int pos, int value) const; - inline isl::basic_map fix_val(isl::dim type, unsigned int pos, isl::val v) const; - inline isl::basic_map flat_product(isl::basic_map bmap2) const; - inline isl::basic_map flat_range_product(isl::basic_map bmap2) const; - inline isl::basic_map flatten() const; - inline isl::basic_map 
flatten_domain() const; - inline isl::basic_map flatten_range() const; - inline stat foreach_constraint(const std::function &fn) const; - static inline isl::basic_map from_aff(isl::aff aff); - static inline isl::basic_map from_aff_list(isl::space domain_space, isl::aff_list list); - static inline isl::basic_map from_constraint(isl::constraint constraint); - static inline isl::basic_map from_domain(isl::basic_set bset); - static inline isl::basic_map from_domain_and_range(isl::basic_set domain, isl::basic_set range); - static inline isl::basic_map from_multi_aff(isl::multi_aff maff); - static inline isl::basic_map from_qpolynomial(isl::qpolynomial qp); - static inline isl::basic_map from_range(isl::basic_set bset); - inline isl::constraint_list get_constraint_list() const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::aff get_div(int pos) const; - inline isl::local_space get_local_space() const; - inline isl::space get_space() const; - inline std::string get_tuple_name(isl::dim type) const; - inline isl::basic_map gist(isl::basic_map context) const; - inline isl::basic_map gist_domain(isl::basic_set context) const; - inline boolean has_dim_id(isl::dim type, unsigned int pos) const; - static inline isl::basic_map identity(isl::space space); - inline boolean image_is_bounded() const; - inline isl::mat inequalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4, isl::dim c5) const; - inline isl::basic_map insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; - inline isl::basic_map intersect(isl::basic_map bmap2) const; - inline isl::basic_map intersect_domain(isl::basic_set bset) const; - inline isl::basic_map intersect_range(isl::basic_set bset) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean is_disjoint(const isl::basic_map &bmap2) const; - inline boolean is_empty() const; - inline boolean is_equal(const isl::basic_map &bmap2) const; - 
inline boolean is_rational() const; - inline boolean is_single_valued() const; - inline boolean is_strict_subset(const isl::basic_map &bmap2) const; - inline boolean is_subset(const isl::basic_map &bmap2) const; - inline boolean is_universe() const; - static inline isl::basic_map less_at(isl::space space, unsigned int pos); - inline isl::map lexmax() const; - inline isl::map lexmin() const; - inline isl::pw_multi_aff lexmin_pw_multi_aff() const; - inline isl::basic_map lower_bound_si(isl::dim type, unsigned int pos, int value) const; - static inline isl::basic_map more_at(isl::space space, unsigned int pos); - inline isl::basic_map move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl_size n_constraint() const; - static inline isl::basic_map nat_universe(isl::space space); - inline isl::basic_map neg() const; - inline isl::basic_map order_ge(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::basic_map order_gt(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::val plain_get_val_if_fixed(isl::dim type, unsigned int pos) const; - inline boolean plain_is_empty() const; - inline boolean plain_is_universe() const; - inline isl::basic_map preimage_domain_multi_aff(isl::multi_aff ma) const; - inline isl::basic_map preimage_range_multi_aff(isl::multi_aff ma) const; - inline isl::basic_map product(isl::basic_map bmap2) const; - inline isl::basic_map project_out(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_set range() const; - inline isl::basic_map range_map() const; - inline isl::basic_map range_product(isl::basic_map bmap2) const; - inline isl::basic_map remove_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_map remove_divs() const; - inline isl::basic_map remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_map remove_redundancies() 
const; - inline isl::basic_map reverse() const; - inline isl::basic_map sample() const; - inline isl::basic_map set_tuple_id(isl::dim type, isl::id id) const; - inline isl::basic_map set_tuple_name(isl::dim type, const std::string &s) const; - inline isl::basic_map sum(isl::basic_map bmap2) const; - inline isl::basic_map uncurry() const; - inline isl::map unite(isl::basic_map bmap2) const; - static inline isl::basic_map universe(isl::space space); - inline isl::basic_map upper_bound_si(isl::dim type, unsigned int pos, int value) const; - inline isl::basic_set wrap() const; - inline isl::basic_map zip() const; }; -// declarations for isl::basic_map_list -inline basic_map_list manage(__isl_take isl_basic_map_list *ptr); -inline basic_map_list manage_copy(__isl_keep isl_basic_map_list *ptr); - -class basic_map_list { - friend inline basic_map_list manage(__isl_take isl_basic_map_list *ptr); - friend inline basic_map_list manage_copy(__isl_keep isl_basic_map_list *ptr); +// declarations for isl::ast_expr_op_add - isl_basic_map_list *ptr = nullptr; +class ast_expr_op_add : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_add ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_add; - inline explicit basic_map_list(__isl_take isl_basic_map_list *ptr); +protected: + inline explicit ast_expr_op_add(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ basic_map_list(); - inline /* implicit */ basic_map_list(const basic_map_list &obj); - inline basic_map_list &operator=(basic_map_list obj); - inline ~basic_map_list(); - inline __isl_give isl_basic_map_list *copy() const &; - inline __isl_give isl_basic_map_list *copy() && = delete; - inline __isl_keep isl_basic_map_list *get() const; - inline __isl_give isl_basic_map_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_add(); + inline /* implicit */ ast_expr_op_add(const ast_expr_op_add &obj); + inline ast_expr_op_add 
&operator=(ast_expr_op_add obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::basic_map_list add(isl::basic_map el) const; - static inline isl::basic_map_list alloc(isl::ctx ctx, int n); - inline isl::basic_map_list clear() const; - inline isl::basic_map_list concat(isl::basic_map_list list2) const; - inline isl::basic_map_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::basic_map_list from_basic_map(isl::basic_map el); - inline isl::basic_map get_at(int index) const; - inline isl::basic_map get_basic_map(int index) const; - inline isl::basic_map_list insert(unsigned int pos, isl::basic_map el) const; - inline isl_size n_basic_map() const; - inline isl::basic_map_list reverse() const; - inline isl::basic_map_list set_basic_map(int index, isl::basic_map el) const; - inline isl_size size() const; - inline isl::basic_map_list swap(unsigned int pos1, unsigned int pos2) const; }; -// declarations for isl::basic_set -inline basic_set manage(__isl_take isl_basic_set *ptr); -inline basic_set manage_copy(__isl_keep isl_basic_set *ptr); - -class basic_set { - friend inline basic_set manage(__isl_take isl_basic_set *ptr); - friend inline basic_set manage_copy(__isl_keep isl_basic_set *ptr); +// declarations for isl::ast_expr_op_address_of - isl_basic_set *ptr = nullptr; +class ast_expr_op_address_of : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_address_of ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_address_of; - inline explicit basic_set(__isl_take isl_basic_set *ptr); +protected: + inline explicit ast_expr_op_address_of(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ basic_set(); - inline /* implicit */ basic_set(const basic_set &obj); - inline /* implicit */ basic_set(isl::point pnt); - inline explicit basic_set(isl::ctx ctx, const std::string &str); - inline basic_set 
&operator=(basic_set obj); - inline ~basic_set(); - inline __isl_give isl_basic_set *copy() const &; - inline __isl_give isl_basic_set *copy() && = delete; - inline __isl_keep isl_basic_set *get() const; - inline __isl_give isl_basic_set *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_address_of(); + inline /* implicit */ ast_expr_op_address_of(const ast_expr_op_address_of &obj); + inline ast_expr_op_address_of &operator=(ast_expr_op_address_of obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::basic_set affine_hull() const; - inline isl::basic_set align_params(isl::space model) const; - inline isl::basic_set apply(isl::basic_map bmap) const; - static inline isl::basic_set box_from_points(isl::point pnt1, isl::point pnt2); - inline isl::basic_set coefficients() const; - inline isl::basic_set detect_equalities() const; - inline isl_size dim(isl::dim type) const; - inline isl::val dim_max_val(int pos) const; - inline isl::basic_set drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_set drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_set drop_unused_params() const; - inline isl::basic_set eliminate(isl::dim type, unsigned int first, unsigned int n) const; - static inline isl::basic_set empty(isl::space space); - inline isl::mat equalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4) const; - inline isl::basic_set fix_si(isl::dim type, unsigned int pos, int value) const; - inline isl::basic_set fix_val(isl::dim type, unsigned int pos, isl::val v) const; - inline isl::basic_set flat_product(isl::basic_set bset2) const; - inline isl::basic_set flatten() const; - inline stat foreach_bound_pair(isl::dim type, unsigned int pos, const std::function &fn) const; - inline stat foreach_constraint(const std::function &fn) const; - static inline isl::basic_set 
from_constraint(isl::constraint constraint); - static inline isl::basic_set from_multi_aff(isl::multi_aff ma); - inline isl::basic_set from_params() const; - inline isl::constraint_list get_constraint_list() const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::aff get_div(int pos) const; - inline isl::local_space get_local_space() const; - inline isl::space get_space() const; - inline std::string get_tuple_name() const; - inline isl::basic_set gist(isl::basic_set context) const; - inline isl::mat inequalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4) const; - inline isl::basic_set insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; - inline isl::basic_set intersect(isl::basic_set bset2) const; - inline isl::basic_set intersect_params(isl::basic_set bset2) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean is_bounded() const; - inline boolean is_disjoint(const isl::basic_set &bset2) const; - inline boolean is_empty() const; - inline boolean is_equal(const isl::basic_set &bset2) const; - inline int is_rational() const; - inline boolean is_subset(const isl::basic_set &bset2) const; - inline boolean is_universe() const; - inline boolean is_wrapping() const; - inline isl::set lexmax() const; - inline isl::set lexmin() const; - inline isl::basic_set lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const; - inline isl::val max_val(const isl::aff &obj) const; - inline isl::basic_set move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl_size n_constraint() const; - inline isl_size n_dim() const; - static inline isl::basic_set nat_universe(isl::space space); - inline isl::basic_set neg() const; - inline isl::basic_set params() const; - inline boolean plain_is_empty() const; - inline 
boolean plain_is_equal(const isl::basic_set &bset2) const; - inline boolean plain_is_universe() const; - static inline isl::basic_set positive_orthant(isl::space space); - inline isl::basic_set preimage_multi_aff(isl::multi_aff ma) const; - inline isl::basic_set project_out(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::mat reduced_basis() const; - inline isl::basic_set remove_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_set remove_divs() const; - inline isl::basic_set remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::basic_set remove_redundancies() const; - inline isl::basic_set remove_unknown_divs() const; - inline isl::basic_set sample() const; - inline isl::point sample_point() const; - inline isl::basic_set set_tuple_id(isl::id id) const; - inline isl::basic_set set_tuple_name(const std::string &s) const; - inline isl::basic_set solutions() const; - inline isl::set unite(isl::basic_set bset2) const; - static inline isl::basic_set universe(isl::space space); - inline isl::basic_map unwrap() const; - inline isl::basic_set upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const; }; -// declarations for isl::basic_set_list -inline basic_set_list manage(__isl_take isl_basic_set_list *ptr); -inline basic_set_list manage_copy(__isl_keep isl_basic_set_list *ptr); - -class basic_set_list { - friend inline basic_set_list manage(__isl_take isl_basic_set_list *ptr); - friend inline basic_set_list manage_copy(__isl_keep isl_basic_set_list *ptr); +// declarations for isl::ast_expr_op_and - isl_basic_set_list *ptr = nullptr; +class ast_expr_op_and : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_and ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_and; - inline explicit basic_set_list(__isl_take isl_basic_set_list *ptr); +protected: + inline explicit ast_expr_op_and(__isl_take 
isl_ast_expr *ptr); public: - inline /* implicit */ basic_set_list(); - inline /* implicit */ basic_set_list(const basic_set_list &obj); - inline basic_set_list &operator=(basic_set_list obj); - inline ~basic_set_list(); - inline __isl_give isl_basic_set_list *copy() const &; - inline __isl_give isl_basic_set_list *copy() && = delete; - inline __isl_keep isl_basic_set_list *get() const; - inline __isl_give isl_basic_set_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_and(); + inline /* implicit */ ast_expr_op_and(const ast_expr_op_and &obj); + inline ast_expr_op_and &operator=(ast_expr_op_and obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::basic_set_list add(isl::basic_set el) const; - static inline isl::basic_set_list alloc(isl::ctx ctx, int n); - inline isl::basic_set_list clear() const; - inline isl::basic_set_list coefficients() const; - inline isl::basic_set_list concat(isl::basic_set_list list2) const; - inline isl::basic_set_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::basic_set_list from_basic_set(isl::basic_set el); - inline isl::basic_set get_at(int index) const; - inline isl::basic_set get_basic_set(int index) const; - inline isl::basic_set_list insert(unsigned int pos, isl::basic_set el) const; - inline isl_size n_basic_set() const; - inline isl::basic_set_list reverse() const; - inline isl::basic_set_list set_basic_set(int index, isl::basic_set el) const; - inline isl_size size() const; - inline isl::basic_set_list swap(unsigned int pos1, unsigned int pos2) const; }; -// declarations for isl::constraint -inline constraint manage(__isl_take isl_constraint *ptr); -inline constraint manage_copy(__isl_keep isl_constraint *ptr); - -class constraint { - friend inline constraint manage(__isl_take isl_constraint *ptr); - friend inline constraint manage_copy(__isl_keep isl_constraint *ptr); +// declarations for 
isl::ast_expr_op_and_then - isl_constraint *ptr = nullptr; +class ast_expr_op_and_then : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_and_then ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_and_then; - inline explicit constraint(__isl_take isl_constraint *ptr); +protected: + inline explicit ast_expr_op_and_then(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ constraint(); - inline /* implicit */ constraint(const constraint &obj); - inline constraint &operator=(constraint obj); - inline ~constraint(); - inline __isl_give isl_constraint *copy() const &; - inline __isl_give isl_constraint *copy() && = delete; - inline __isl_keep isl_constraint *get() const; - inline __isl_give isl_constraint *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_and_then(); + inline /* implicit */ ast_expr_op_and_then(const ast_expr_op_and_then &obj); + inline ast_expr_op_and_then &operator=(ast_expr_op_and_then obj); inline isl::ctx ctx() const; - inline void dump() const; - static inline isl::constraint alloc_equality(isl::local_space ls); - static inline isl::constraint alloc_inequality(isl::local_space ls); - inline int cmp_last_non_zero(const isl::constraint &c2) const; - inline isl::aff get_aff() const; - inline isl::aff get_bound(isl::dim type, int pos) const; - inline isl::val get_coefficient_val(isl::dim type, int pos) const; - inline isl::val get_constant_val() const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::aff get_div(int pos) const; - inline isl::local_space get_local_space() const; - inline isl::space get_space() const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean is_div_constraint() const; - inline boolean is_lower_bound(isl::dim type, unsigned int pos) const; - inline boolean is_upper_bound(isl::dim type, unsigned int pos) const; - inline int 
plain_cmp(const isl::constraint &c2) const; - inline isl::constraint set_coefficient_si(isl::dim type, int pos, int v) const; - inline isl::constraint set_coefficient_val(isl::dim type, int pos, isl::val v) const; - inline isl::constraint set_constant_si(int v) const; - inline isl::constraint set_constant_val(isl::val v) const; }; -// declarations for isl::constraint_list -inline constraint_list manage(__isl_take isl_constraint_list *ptr); -inline constraint_list manage_copy(__isl_keep isl_constraint_list *ptr); - -class constraint_list { - friend inline constraint_list manage(__isl_take isl_constraint_list *ptr); - friend inline constraint_list manage_copy(__isl_keep isl_constraint_list *ptr); +// declarations for isl::ast_expr_op_call - isl_constraint_list *ptr = nullptr; +class ast_expr_op_call : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_call ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_call; - inline explicit constraint_list(__isl_take isl_constraint_list *ptr); +protected: + inline explicit ast_expr_op_call(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ constraint_list(); - inline /* implicit */ constraint_list(const constraint_list &obj); - inline constraint_list &operator=(constraint_list obj); - inline ~constraint_list(); - inline __isl_give isl_constraint_list *copy() const &; - inline __isl_give isl_constraint_list *copy() && = delete; - inline __isl_keep isl_constraint_list *get() const; - inline __isl_give isl_constraint_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_call(); + inline /* implicit */ ast_expr_op_call(const ast_expr_op_call &obj); + inline ast_expr_op_call &operator=(ast_expr_op_call obj); inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::constraint_list add(isl::constraint el) const; - static inline isl::constraint_list alloc(isl::ctx ctx, int n); - inline isl::constraint_list clear() 
const; - inline isl::constraint_list concat(isl::constraint_list list2) const; - inline isl::constraint_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::constraint_list from_constraint(isl::constraint el); - inline isl::constraint get_at(int index) const; - inline isl::constraint get_constraint(int index) const; - inline isl::constraint_list insert(unsigned int pos, isl::constraint el) const; - inline isl_size n_constraint() const; - inline isl::constraint_list reverse() const; - inline isl::constraint_list set_constraint(int index, isl::constraint el) const; - inline isl_size size() const; - inline isl::constraint_list swap(unsigned int pos1, unsigned int pos2) const; -}; -// declarations for isl::fixed_box -inline fixed_box manage(__isl_take isl_fixed_box *ptr); -inline fixed_box manage_copy(__isl_keep isl_fixed_box *ptr); +}; -class fixed_box { - friend inline fixed_box manage(__isl_take isl_fixed_box *ptr); - friend inline fixed_box manage_copy(__isl_keep isl_fixed_box *ptr); +// declarations for isl::ast_expr_op_cond - isl_fixed_box *ptr = nullptr; +class ast_expr_op_cond : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_cond ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_cond; - inline explicit fixed_box(__isl_take isl_fixed_box *ptr); +protected: + inline explicit ast_expr_op_cond(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ fixed_box(); - inline /* implicit */ fixed_box(const fixed_box &obj); - inline fixed_box &operator=(fixed_box obj); - inline ~fixed_box(); - inline __isl_give isl_fixed_box *copy() const &; - inline __isl_give isl_fixed_box *copy() && = delete; - inline __isl_keep isl_fixed_box *get() const; - inline __isl_give isl_fixed_box *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_cond(); + inline /* implicit */ ast_expr_op_cond(const ast_expr_op_cond 
&obj); + inline ast_expr_op_cond &operator=(ast_expr_op_cond obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::multi_aff get_offset() const; - inline isl::multi_val get_size() const; - inline isl::space get_space() const; - inline boolean is_valid() const; }; -// declarations for isl::id -inline id manage(__isl_take isl_id *ptr); -inline id manage_copy(__isl_keep isl_id *ptr); - -class id { - friend inline id manage(__isl_take isl_id *ptr); - friend inline id manage_copy(__isl_keep isl_id *ptr); +// declarations for isl::ast_expr_op_div - isl_id *ptr = nullptr; +class ast_expr_op_div : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_div ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_div; - inline explicit id(__isl_take isl_id *ptr); +protected: + inline explicit ast_expr_op_div(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ id(); - inline /* implicit */ id(const id &obj); - inline explicit id(isl::ctx ctx, const std::string &str); - inline id &operator=(id obj); - inline ~id(); - inline __isl_give isl_id *copy() const &; - inline __isl_give isl_id *copy() && = delete; - inline __isl_keep isl_id *get() const; - inline __isl_give isl_id *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_div(); + inline /* implicit */ ast_expr_op_div(const ast_expr_op_div &obj); + inline ast_expr_op_div &operator=(ast_expr_op_div obj); inline isl::ctx ctx() const; - inline void dump() const; - static inline isl::id alloc(isl::ctx ctx, const std::string &name, void * user); - inline uint32_t get_hash() const; - inline std::string get_name() const; - inline void * get_user() const; }; -// declarations for isl::id_list -inline id_list manage(__isl_take isl_id_list *ptr); -inline id_list manage_copy(__isl_keep isl_id_list *ptr); +// declarations for isl::ast_expr_op_eq -class id_list { - friend inline id_list manage(__isl_take isl_id_list *ptr); - 
friend inline id_list manage_copy(__isl_keep isl_id_list *ptr); +class ast_expr_op_eq : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_eq ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_eq; - isl_id_list *ptr = nullptr; +protected: + inline explicit ast_expr_op_eq(__isl_take isl_ast_expr *ptr); - inline explicit id_list(__isl_take isl_id_list *ptr); +public: + inline /* implicit */ ast_expr_op_eq(); + inline /* implicit */ ast_expr_op_eq(const ast_expr_op_eq &obj); + inline ast_expr_op_eq &operator=(ast_expr_op_eq obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_fdiv_q + +class ast_expr_op_fdiv_q : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_fdiv_q ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_fdiv_q; + +protected: + inline explicit ast_expr_op_fdiv_q(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ id_list(); - inline /* implicit */ id_list(const id_list &obj); - inline id_list &operator=(id_list obj); - inline ~id_list(); - inline __isl_give isl_id_list *copy() const &; - inline __isl_give isl_id_list *copy() && = delete; - inline __isl_keep isl_id_list *get() const; - inline __isl_give isl_id_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_fdiv_q(); + inline /* implicit */ ast_expr_op_fdiv_q(const ast_expr_op_fdiv_q &obj); + inline ast_expr_op_fdiv_q &operator=(ast_expr_op_fdiv_q obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::id_list add(isl::id el) const; - static inline isl::id_list alloc(isl::ctx ctx, int n); - inline isl::id_list clear() const; - inline isl::id_list concat(isl::id_list list2) const; - inline isl::id_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::id_list from_id(isl::id el); - inline isl::id get_at(int index) 
const; - inline isl::id get_id(int index) const; - inline isl::id_list insert(unsigned int pos, isl::id el) const; - inline isl_size n_id() const; - inline isl::id_list reverse() const; - inline isl::id_list set_id(int index, isl::id el) const; - inline isl_size size() const; - inline isl::id_list swap(unsigned int pos1, unsigned int pos2) const; }; -// declarations for isl::id_to_ast_expr -inline id_to_ast_expr manage(__isl_take isl_id_to_ast_expr *ptr); -inline id_to_ast_expr manage_copy(__isl_keep isl_id_to_ast_expr *ptr); +// declarations for isl::ast_expr_op_ge -class id_to_ast_expr { - friend inline id_to_ast_expr manage(__isl_take isl_id_to_ast_expr *ptr); - friend inline id_to_ast_expr manage_copy(__isl_keep isl_id_to_ast_expr *ptr); +class ast_expr_op_ge : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_ge ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_ge; - isl_id_to_ast_expr *ptr = nullptr; +protected: + inline explicit ast_expr_op_ge(__isl_take isl_ast_expr *ptr); - inline explicit id_to_ast_expr(__isl_take isl_id_to_ast_expr *ptr); +public: + inline /* implicit */ ast_expr_op_ge(); + inline /* implicit */ ast_expr_op_ge(const ast_expr_op_ge &obj); + inline ast_expr_op_ge &operator=(ast_expr_op_ge obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_gt + +class ast_expr_op_gt : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_gt ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_gt; + +protected: + inline explicit ast_expr_op_gt(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ id_to_ast_expr(); - inline /* implicit */ id_to_ast_expr(const id_to_ast_expr &obj); - inline id_to_ast_expr &operator=(id_to_ast_expr obj); - inline ~id_to_ast_expr(); - inline __isl_give isl_id_to_ast_expr *copy() const &; - inline __isl_give isl_id_to_ast_expr *copy() && = delete; - inline __isl_keep 
isl_id_to_ast_expr *get() const; - inline __isl_give isl_id_to_ast_expr *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_gt(); + inline /* implicit */ ast_expr_op_gt(const ast_expr_op_gt &obj); + inline ast_expr_op_gt &operator=(ast_expr_op_gt obj); inline isl::ctx ctx() const; - inline void dump() const; - static inline isl::id_to_ast_expr alloc(isl::ctx ctx, int min_size); - inline isl::id_to_ast_expr drop(isl::id key) const; - inline stat foreach(const std::function &fn) const; - inline isl::ast_expr get(isl::id key) const; - inline boolean has(const isl::id &key) const; - inline isl::id_to_ast_expr set(isl::id key, isl::ast_expr val) const; }; -// declarations for isl::local_space -inline local_space manage(__isl_take isl_local_space *ptr); -inline local_space manage_copy(__isl_keep isl_local_space *ptr); +// declarations for isl::ast_expr_op_le -class local_space { - friend inline local_space manage(__isl_take isl_local_space *ptr); - friend inline local_space manage_copy(__isl_keep isl_local_space *ptr); +class ast_expr_op_le : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_le ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_le; - isl_local_space *ptr = nullptr; +protected: + inline explicit ast_expr_op_le(__isl_take isl_ast_expr *ptr); - inline explicit local_space(__isl_take isl_local_space *ptr); +public: + inline /* implicit */ ast_expr_op_le(); + inline /* implicit */ ast_expr_op_le(const ast_expr_op_le &obj); + inline ast_expr_op_le &operator=(ast_expr_op_le obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_lt + +class ast_expr_op_lt : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_lt ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_lt; + +protected: + inline explicit ast_expr_op_lt(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ 
local_space(); - inline /* implicit */ local_space(const local_space &obj); - inline explicit local_space(isl::space space); - inline local_space &operator=(local_space obj); - inline ~local_space(); - inline __isl_give isl_local_space *copy() const &; - inline __isl_give isl_local_space *copy() && = delete; - inline __isl_keep isl_local_space *get() const; - inline __isl_give isl_local_space *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_lt(); + inline /* implicit */ ast_expr_op_lt(const ast_expr_op_lt &obj); + inline ast_expr_op_lt &operator=(ast_expr_op_lt obj); inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::local_space add_dims(isl::dim type, unsigned int n) const; - inline isl_size dim(isl::dim type) const; - inline isl::local_space domain() const; - inline isl::local_space drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::local_space flatten_domain() const; - inline isl::local_space flatten_range() const; - inline isl::local_space from_domain() const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::aff get_div(int pos) const; - inline isl::space get_space() const; - inline boolean has_dim_id(isl::dim type, unsigned int pos) const; - inline boolean has_dim_name(isl::dim type, unsigned int pos) const; - inline isl::local_space insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::local_space intersect(isl::local_space ls2) const; - inline boolean is_equal(const isl::local_space &ls2) const; - inline boolean is_params() const; - inline boolean is_set() const; - inline isl::local_space range() const; - inline isl::local_space set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::local_space set_from_params() const; - inline isl::local_space 
set_tuple_id(isl::dim type, isl::id id) const; - inline isl::local_space wrap() const; + }; -// declarations for isl::map -inline map manage(__isl_take isl_map *ptr); -inline map manage_copy(__isl_keep isl_map *ptr); +// declarations for isl::ast_expr_op_max -class map { - friend inline map manage(__isl_take isl_map *ptr); - friend inline map manage_copy(__isl_keep isl_map *ptr); +class ast_expr_op_max : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_max ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_max; - isl_map *ptr = nullptr; +protected: + inline explicit ast_expr_op_max(__isl_take isl_ast_expr *ptr); - inline explicit map(__isl_take isl_map *ptr); +public: + inline /* implicit */ ast_expr_op_max(); + inline /* implicit */ ast_expr_op_max(const ast_expr_op_max &obj); + inline ast_expr_op_max &operator=(ast_expr_op_max obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_member + +class ast_expr_op_member : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_member ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_member; + +protected: + inline explicit ast_expr_op_member(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ map(); - inline /* implicit */ map(const map &obj); - inline /* implicit */ map(isl::basic_map bmap); - inline explicit map(isl::ctx ctx, const std::string &str); - inline map &operator=(map obj); - inline ~map(); - inline __isl_give isl_map *copy() const &; - inline __isl_give isl_map *copy() && = delete; - inline __isl_keep isl_map *get() const; - inline __isl_give isl_map *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_member(); + inline /* implicit */ ast_expr_op_member(const ast_expr_op_member &obj); + inline ast_expr_op_member &operator=(ast_expr_op_member obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::map 
add_constraint(isl::constraint constraint) const; - inline isl::map add_dims(isl::dim type, unsigned int n) const; - inline isl::basic_map affine_hull() const; - inline isl::map align_params(isl::space model) const; - inline isl::map apply_domain(isl::map map2) const; - inline isl::map apply_range(isl::map map2) const; - inline isl::set bind_domain(isl::multi_id tuple) const; - inline isl::set bind_range(isl::multi_id tuple) const; - inline boolean can_curry() const; - inline boolean can_range_curry() const; - inline boolean can_uncurry() const; - inline boolean can_zip() const; - inline isl::map coalesce() const; - inline isl::map complement() const; - inline isl::basic_map convex_hull() const; - inline isl::map curry() const; - inline isl::set deltas() const; - inline isl::map deltas_map() const; - inline isl::map detect_equalities() const; - inline isl_size dim(isl::dim type) const; - inline isl::pw_aff dim_max(int pos) const; - inline isl::pw_aff dim_min(int pos) const; - inline isl::set domain() const; - inline isl::map domain_factor_domain() const; - inline isl::map domain_factor_range() const; - inline boolean domain_is_wrapping() const; - inline isl::map domain_map() const; - inline isl::map domain_product(isl::map map2) const; - inline isl_size domain_tuple_dim() const; - inline isl::map drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::map drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::map drop_unused_params() const; - inline isl::map eliminate(isl::dim type, unsigned int first, unsigned int n) const; - static inline isl::map empty(isl::space space); - inline isl::map eq_at(isl::multi_pw_aff mpa) const; - inline isl::map equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::map factor_domain() const; - inline isl::map factor_range() const; - inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline 
int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::map fix_si(isl::dim type, unsigned int pos, int value) const; - inline isl::map fix_val(isl::dim type, unsigned int pos, isl::val v) const; - inline isl::map fixed_power_val(isl::val exp) const; - inline isl::map flat_domain_product(isl::map map2) const; - inline isl::map flat_product(isl::map map2) const; - inline isl::map flat_range_product(isl::map map2) const; - inline isl::map flatten() const; - inline isl::map flatten_domain() const; - inline isl::map flatten_range() const; - inline isl::map floordiv_val(isl::val d) const; - inline stat foreach_basic_map(const std::function &fn) const; - static inline isl::map from_aff(isl::aff aff); - static inline isl::map from_domain(isl::set set); - static inline isl::map from_domain_and_range(isl::set domain, isl::set range); - static inline isl::map from_multi_aff(isl::multi_aff maff); - static inline isl::map from_multi_pw_aff(isl::multi_pw_aff mpa); - static inline isl::map from_pw_aff(isl::pw_aff pwaff); - static inline isl::map from_pw_multi_aff(isl::pw_multi_aff pma); - static inline isl::map from_range(isl::set set); - static inline isl::map from_union_map(isl::union_map umap); - inline isl::basic_map_list get_basic_map_list() const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline uint32_t get_hash() const; - inline isl::fixed_box get_range_simple_fixed_box_hull() const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; - inline isl::map gist(isl::map context) const; - inline isl::map gist_basic_map(isl::basic_map context) const; - inline isl::map gist_domain(isl::set context) const; - inline isl::map gist_params(isl::set context) const; - inline isl::map gist_range(isl::set context) const; - inline boolean has_dim_id(isl::dim type, 
unsigned int pos) const; - inline boolean has_dim_name(isl::dim type, unsigned int pos) const; - inline boolean has_equal_space(const isl::map &map2) const; - inline boolean has_tuple_id(isl::dim type) const; - inline boolean has_tuple_name(isl::dim type) const; - static inline isl::map identity(isl::space space); - inline isl::map insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; - inline isl::map intersect(isl::map map2) const; - inline isl::map intersect_domain(isl::set set) const; - inline isl::map intersect_domain_factor_domain(isl::map factor) const; - inline isl::map intersect_domain_factor_range(isl::map factor) const; - inline isl::map intersect_params(isl::set params) const; - inline isl::map intersect_range(isl::set set) const; - inline isl::map intersect_range_factor_domain(isl::map factor) const; - inline isl::map intersect_range_factor_range(isl::map factor) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean is_bijective() const; - inline boolean is_disjoint(const isl::map &map2) const; - inline boolean is_empty() const; - inline boolean is_equal(const isl::map &map2) const; - inline boolean is_identity() const; - inline boolean is_injective() const; - inline boolean is_product() const; - inline boolean is_single_valued() const; - inline boolean is_strict_subset(const isl::map &map2) const; - inline boolean is_subset(const isl::map &map2) const; - inline int is_translation() const; - static inline isl::map lex_ge(isl::space set_space); - inline isl::map lex_ge_at(isl::multi_pw_aff mpa) const; - static inline isl::map lex_ge_first(isl::space space, unsigned int n); - inline isl::map lex_ge_map(isl::map map2) const; - static inline isl::map lex_gt(isl::space set_space); - inline isl::map lex_gt_at(isl::multi_pw_aff mpa) const; - static inline isl::map lex_gt_first(isl::space space, unsigned int n); - inline isl::map lex_gt_map(isl::map map2) const; - static inline isl::map 
lex_le(isl::space set_space); - inline isl::map lex_le_at(isl::multi_pw_aff mpa) const; - static inline isl::map lex_le_first(isl::space space, unsigned int n); - inline isl::map lex_le_map(isl::map map2) const; - static inline isl::map lex_lt(isl::space set_space); - inline isl::map lex_lt_at(isl::multi_pw_aff mpa) const; - static inline isl::map lex_lt_first(isl::space space, unsigned int n); - inline isl::map lex_lt_map(isl::map map2) const; - inline isl::map lexmax() const; - inline isl::pw_multi_aff lexmax_pw_multi_aff() const; - inline isl::map lexmin() const; - inline isl::pw_multi_aff lexmin_pw_multi_aff() const; - inline isl::map lower_bound(isl::multi_pw_aff lower) const; - inline isl::map lower_bound_si(isl::dim type, unsigned int pos, int value) const; - inline isl::map lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const; - inline isl::multi_pw_aff max_multi_pw_aff() const; - inline isl::multi_pw_aff min_multi_pw_aff() const; - inline isl::map move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl_size n_basic_map() const; - static inline isl::map nat_universe(isl::space space); - inline isl::map neg() const; - inline isl::map oppose(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::map order_ge(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::map order_gt(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::map order_le(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::map order_lt(isl::dim type1, int pos1, isl::dim type2, int pos2) const; - inline isl::set params() const; - inline isl::val plain_get_val_if_fixed(isl::dim type, unsigned int pos) const; - inline boolean plain_is_empty() const; - inline boolean plain_is_equal(const isl::map &map2) const; - inline boolean plain_is_injective() const; - inline boolean plain_is_single_valued() const; - inline boolean 
plain_is_universe() const; - inline isl::basic_map plain_unshifted_simple_hull() const; - inline isl::basic_map polyhedral_hull() const; - inline isl::map preimage_domain(isl::multi_aff ma) const; - inline isl::map preimage_domain(isl::multi_pw_aff mpa) const; - inline isl::map preimage_domain(isl::pw_multi_aff pma) const; - inline isl::map preimage_range(isl::multi_aff ma) const; - inline isl::map preimage_range(isl::pw_multi_aff pma) const; - inline isl::map product(isl::map map2) const; - inline isl::map project_out(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::map project_out_all_params() const; - inline isl::set range() const; - inline isl::map range_curry() const; - inline isl::map range_factor_domain() const; - inline isl::map range_factor_range() const; - inline boolean range_is_wrapping() const; - inline isl::map range_map() const; - inline isl::map range_product(isl::map map2) const; - inline isl::map range_reverse() const; - inline isl_size range_tuple_dim() const; - inline isl::map remove_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::map remove_divs() const; - inline isl::map remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::map remove_redundancies() const; - inline isl::map remove_unknown_divs() const; - inline isl::map reset_tuple_id(isl::dim type) const; - inline isl::map reset_user() const; - inline isl::map reverse() const; - inline isl::basic_map sample() const; - inline isl::map set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::map set_tuple_id(isl::dim type, isl::id id) const; - inline isl::map set_tuple_name(isl::dim type, const std::string &s) const; - inline isl::basic_map simple_hull() const; - inline isl::map subtract(isl::map map2) const; - inline isl::map subtract_domain(isl::set dom) const; - inline isl::map subtract_range(isl::set dom) const; - inline isl::map sum(isl::map map2) const; - inline isl::map 
uncurry() const; - inline isl::map unite(isl::map map2) const; - static inline isl::map universe(isl::space space); - inline isl::basic_map unshifted_simple_hull() const; - inline isl::basic_map unshifted_simple_hull_from_map_list(isl::map_list list) const; - inline isl::map upper_bound(isl::multi_pw_aff upper) const; - inline isl::map upper_bound_si(isl::dim type, unsigned int pos, int value) const; - inline isl::map upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const; - inline isl::set wrap() const; - inline isl::map zip() const; }; -// declarations for isl::map_list -inline map_list manage(__isl_take isl_map_list *ptr); -inline map_list manage_copy(__isl_keep isl_map_list *ptr); - -class map_list { - friend inline map_list manage(__isl_take isl_map_list *ptr); - friend inline map_list manage_copy(__isl_keep isl_map_list *ptr); +// declarations for isl::ast_expr_op_min - isl_map_list *ptr = nullptr; +class ast_expr_op_min : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_min ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_min; - inline explicit map_list(__isl_take isl_map_list *ptr); +protected: + inline explicit ast_expr_op_min(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ map_list(); - inline /* implicit */ map_list(const map_list &obj); - inline map_list &operator=(map_list obj); - inline ~map_list(); - inline __isl_give isl_map_list *copy() const &; - inline __isl_give isl_map_list *copy() && = delete; - inline __isl_keep isl_map_list *get() const; - inline __isl_give isl_map_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_min(); + inline /* implicit */ ast_expr_op_min(const ast_expr_op_min &obj); + inline ast_expr_op_min &operator=(ast_expr_op_min obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::map_list add(isl::map el) const; - static inline isl::map_list alloc(isl::ctx ctx, int n); - inline 
isl::map_list clear() const; - inline isl::map_list concat(isl::map_list list2) const; - inline isl::map_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::map_list from_map(isl::map el); - inline isl::map get_at(int index) const; - inline isl::map get_map(int index) const; - inline isl::map_list insert(unsigned int pos, isl::map el) const; - inline isl_size n_map() const; - inline isl::map_list reverse() const; - inline isl::map_list set_map(int index, isl::map el) const; - inline isl_size size() const; - inline isl::map_list swap(unsigned int pos1, unsigned int pos2) const; }; -// declarations for isl::mat -inline mat manage(__isl_take isl_mat *ptr); -inline mat manage_copy(__isl_keep isl_mat *ptr); +// declarations for isl::ast_expr_op_minus -class mat { - friend inline mat manage(__isl_take isl_mat *ptr); - friend inline mat manage_copy(__isl_keep isl_mat *ptr); +class ast_expr_op_minus : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_minus ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_minus; - isl_mat *ptr = nullptr; - - inline explicit mat(__isl_take isl_mat *ptr); +protected: + inline explicit ast_expr_op_minus(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ mat(); - inline /* implicit */ mat(const mat &obj); - inline mat &operator=(mat obj); - inline ~mat(); - inline __isl_give isl_mat *copy() const &; - inline __isl_give isl_mat *copy() && = delete; - inline __isl_keep isl_mat *get() const; - inline __isl_give isl_mat *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_minus(); + inline /* implicit */ ast_expr_op_minus(const ast_expr_op_minus &obj); + inline ast_expr_op_minus &operator=(ast_expr_op_minus obj); inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::mat add_rows(unsigned int n) const; - inline isl::mat add_zero_cols(unsigned int n) 
const; - inline isl::mat add_zero_rows(unsigned int n) const; - inline isl::mat aff_direct_sum(isl::mat right) const; - static inline isl::mat alloc(isl::ctx ctx, unsigned int n_row, unsigned int n_col); - inline isl_size cols() const; - inline isl::mat concat(isl::mat bot) const; - inline isl::mat diagonal(isl::mat mat2) const; - inline isl::mat drop_cols(unsigned int col, unsigned int n) const; - inline isl::mat drop_rows(unsigned int row, unsigned int n) const; - static inline isl::mat from_row_vec(isl::vec vec); - inline isl::val get_element_val(int row, int col) const; - inline boolean has_linearly_independent_rows(const isl::mat &mat2) const; - inline int initial_non_zero_cols() const; - inline isl::mat insert_cols(unsigned int col, unsigned int n) const; - inline isl::mat insert_rows(unsigned int row, unsigned int n) const; - inline isl::mat insert_zero_cols(unsigned int first, unsigned int n) const; - inline isl::mat insert_zero_rows(unsigned int row, unsigned int n) const; - inline isl::mat inverse_product(isl::mat right) const; - inline boolean is_equal(const isl::mat &mat2) const; - inline isl::mat lin_to_aff() const; - inline isl::mat move_cols(unsigned int dst_col, unsigned int src_col, unsigned int n) const; - inline isl::mat normalize() const; - inline isl::mat normalize_row(int row) const; - inline isl::mat product(isl::mat right) const; - inline isl_size rank() const; - inline isl::mat right_inverse() const; - inline isl::mat right_kernel() const; - inline isl::mat row_basis() const; - inline isl::mat row_basis_extension(isl::mat mat2) const; - inline isl_size rows() const; - inline isl::mat set_element_si(int row, int col, int v) const; - inline isl::mat set_element_val(int row, int col, isl::val v) const; - inline isl::mat swap_cols(unsigned int i, unsigned int j) const; - inline isl::mat swap_rows(unsigned int i, unsigned int j) const; - inline isl::mat transpose() const; - inline isl::mat unimodular_complete(int row) const; - inline isl::mat 
vec_concat(isl::vec bot) const; - inline isl::vec vec_inverse_product(isl::vec vec) const; - inline isl::vec vec_product(isl::vec vec) const; -}; -// declarations for isl::multi_aff -inline multi_aff manage(__isl_take isl_multi_aff *ptr); -inline multi_aff manage_copy(__isl_keep isl_multi_aff *ptr); +}; -class multi_aff { - friend inline multi_aff manage(__isl_take isl_multi_aff *ptr); - friend inline multi_aff manage_copy(__isl_keep isl_multi_aff *ptr); +// declarations for isl::ast_expr_op_mul - isl_multi_aff *ptr = nullptr; +class ast_expr_op_mul : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_mul ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_mul; - inline explicit multi_aff(__isl_take isl_multi_aff *ptr); +protected: + inline explicit ast_expr_op_mul(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ multi_aff(); - inline /* implicit */ multi_aff(const multi_aff &obj); - inline /* implicit */ multi_aff(isl::aff aff); - inline explicit multi_aff(isl::space space, isl::aff_list list); - inline explicit multi_aff(isl::ctx ctx, const std::string &str); - inline multi_aff &operator=(multi_aff obj); - inline ~multi_aff(); - inline __isl_give isl_multi_aff *copy() const &; - inline __isl_give isl_multi_aff *copy() && = delete; - inline __isl_keep isl_multi_aff *get() const; - inline __isl_give isl_multi_aff *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_mul(); + inline /* implicit */ ast_expr_op_mul(const ast_expr_op_mul &obj); + inline ast_expr_op_mul &operator=(ast_expr_op_mul obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::multi_aff add(isl::multi_aff multi2) const; - inline isl::multi_aff add_constant(isl::multi_val mv) const; - inline isl::multi_aff add_constant(isl::val v) const; - inline isl::multi_aff add_dims(isl::dim type, unsigned int n) const; - inline isl::multi_aff align_params(isl::space model) const; - inline 
isl::basic_set bind(isl::multi_id tuple) const; - inline isl::multi_aff bind_domain(isl::multi_id tuple) const; - inline isl::multi_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; - inline isl_size dim(isl::dim type) const; - static inline isl::multi_aff domain_map(isl::space space); - inline isl::multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::multi_aff factor_range() const; - inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::multi_aff flat_range_product(isl::multi_aff multi2) const; - inline isl::multi_aff flatten_domain() const; - inline isl::multi_aff flatten_range() const; - inline isl::multi_aff floor() const; - inline isl::multi_aff from_range() const; - inline isl::aff get_aff(int pos) const; - inline isl::aff get_at(int pos) const; - inline isl::multi_val get_constant_multi_val() const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline isl::space get_domain_space() const; - inline isl::aff_list get_list() const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; - inline isl::multi_aff gist(isl::set context) const; - inline isl::multi_aff gist_params(isl::set context) const; - inline boolean has_tuple_id(isl::dim type) const; - static inline isl::multi_aff identity(isl::space space); - inline isl::multi_aff identity() const; - static inline isl::multi_aff identity_on_domain(isl::space space); - inline isl::multi_aff insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::multi_aff insert_domain(isl::space domain) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_locals() const; - inline boolean involves_nan() const; - inline isl::set lex_ge_set(isl::multi_aff ma2) const; - 
inline isl::set lex_gt_set(isl::multi_aff ma2) const; - inline isl::set lex_le_set(isl::multi_aff ma2) const; - inline isl::set lex_lt_set(isl::multi_aff ma2) const; - inline isl::multi_aff mod_multi_val(isl::multi_val mv) const; - inline isl::multi_aff move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - static inline isl::multi_aff multi_val_on_space(isl::space space, isl::multi_val mv); - inline isl::multi_aff neg() const; - inline int plain_cmp(const isl::multi_aff &multi2) const; - inline boolean plain_is_equal(const isl::multi_aff &multi2) const; - inline isl::multi_aff product(isl::multi_aff multi2) const; - inline isl::multi_aff project_domain_on_params() const; - static inline isl::multi_aff project_out_map(isl::space space, isl::dim type, unsigned int first, unsigned int n); - inline isl::multi_aff pullback(isl::multi_aff ma2) const; - inline isl::multi_aff range_factor_domain() const; - inline isl::multi_aff range_factor_range() const; - inline boolean range_is_wrapping() const; - static inline isl::multi_aff range_map(isl::space space); - inline isl::multi_aff range_product(isl::multi_aff multi2) const; - inline isl::multi_aff range_splice(unsigned int pos, isl::multi_aff multi2) const; - inline isl::multi_aff reset_tuple_id(isl::dim type) const; - inline isl::multi_aff reset_user() const; - inline isl::multi_aff scale(isl::multi_val mv) const; - inline isl::multi_aff scale(isl::val v) const; - inline isl::multi_aff scale_down(isl::multi_val mv) const; - inline isl::multi_aff scale_down(isl::val v) const; - inline isl::multi_aff set_aff(int pos, isl::aff el) const; - inline isl::multi_aff set_at(int pos, isl::aff el) const; - inline isl::multi_aff set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::multi_aff set_tuple_id(isl::dim type, isl::id id) const; - inline isl::multi_aff set_tuple_name(isl::dim type, const std::string &s) const; - inline isl_size size() 
const; - inline isl::multi_aff splice(unsigned int in_pos, unsigned int out_pos, isl::multi_aff multi2) const; - inline isl::multi_aff sub(isl::multi_aff multi2) const; - inline isl::multi_aff unbind_params_insert_domain(isl::multi_id domain) const; - static inline isl::multi_aff zero(isl::space space); }; -// declarations for isl::multi_id -inline multi_id manage(__isl_take isl_multi_id *ptr); -inline multi_id manage_copy(__isl_keep isl_multi_id *ptr); - -class multi_id { - friend inline multi_id manage(__isl_take isl_multi_id *ptr); - friend inline multi_id manage_copy(__isl_keep isl_multi_id *ptr); +// declarations for isl::ast_expr_op_or - isl_multi_id *ptr = nullptr; +class ast_expr_op_or : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_or ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_or; - inline explicit multi_id(__isl_take isl_multi_id *ptr); +protected: + inline explicit ast_expr_op_or(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ multi_id(); - inline /* implicit */ multi_id(const multi_id &obj); - inline explicit multi_id(isl::space space, isl::id_list list); - inline explicit multi_id(isl::ctx ctx, const std::string &str); - inline multi_id &operator=(multi_id obj); - inline ~multi_id(); - inline __isl_give isl_multi_id *copy() const &; - inline __isl_give isl_multi_id *copy() && = delete; - inline __isl_keep isl_multi_id *get() const; - inline __isl_give isl_multi_id *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_or(); + inline /* implicit */ ast_expr_op_or(const ast_expr_op_or &obj); + inline ast_expr_op_or &operator=(ast_expr_op_or obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::multi_id align_params(isl::space model) const; - inline isl::multi_id factor_range() const; - inline isl::multi_id flat_range_product(isl::multi_id multi2) const; - inline isl::multi_id flatten_range() const; - inline 
isl::multi_id from_range() const; - inline isl::id get_at(int pos) const; - inline isl::space get_domain_space() const; - inline isl::id get_id(int pos) const; - inline isl::id_list get_list() const; - inline isl::space get_space() const; - inline boolean plain_is_equal(const isl::multi_id &multi2) const; - inline isl::multi_id range_factor_domain() const; - inline isl::multi_id range_factor_range() const; - inline boolean range_is_wrapping() const; - inline isl::multi_id range_product(isl::multi_id multi2) const; - inline isl::multi_id range_splice(unsigned int pos, isl::multi_id multi2) const; - inline isl::multi_id reset_user() const; - inline isl::multi_id set_at(int pos, isl::id el) const; - inline isl::multi_id set_id(int pos, isl::id el) const; - inline isl_size size() const; }; -// declarations for isl::multi_pw_aff -inline multi_pw_aff manage(__isl_take isl_multi_pw_aff *ptr); -inline multi_pw_aff manage_copy(__isl_keep isl_multi_pw_aff *ptr); +// declarations for isl::ast_expr_op_or_else -class multi_pw_aff { - friend inline multi_pw_aff manage(__isl_take isl_multi_pw_aff *ptr); - friend inline multi_pw_aff manage_copy(__isl_keep isl_multi_pw_aff *ptr); +class ast_expr_op_or_else : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_or_else ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_or_else; - isl_multi_pw_aff *ptr = nullptr; +protected: + inline explicit ast_expr_op_or_else(__isl_take isl_ast_expr *ptr); - inline explicit multi_pw_aff(__isl_take isl_multi_pw_aff *ptr); +public: + inline /* implicit */ ast_expr_op_or_else(); + inline /* implicit */ ast_expr_op_or_else(const ast_expr_op_or_else &obj); + inline ast_expr_op_or_else &operator=(ast_expr_op_or_else obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_pdiv_q + +class ast_expr_op_pdiv_q : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_pdiv_q 
ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_pdiv_q; + +protected: + inline explicit ast_expr_op_pdiv_q(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ multi_pw_aff(); - inline /* implicit */ multi_pw_aff(const multi_pw_aff &obj); - inline /* implicit */ multi_pw_aff(isl::aff aff); - inline /* implicit */ multi_pw_aff(isl::multi_aff ma); - inline /* implicit */ multi_pw_aff(isl::pw_aff pa); - inline explicit multi_pw_aff(isl::space space, isl::pw_aff_list list); - inline /* implicit */ multi_pw_aff(isl::pw_multi_aff pma); - inline explicit multi_pw_aff(isl::ctx ctx, const std::string &str); - inline multi_pw_aff &operator=(multi_pw_aff obj); - inline ~multi_pw_aff(); - inline __isl_give isl_multi_pw_aff *copy() const &; - inline __isl_give isl_multi_pw_aff *copy() && = delete; - inline __isl_keep isl_multi_pw_aff *get() const; - inline __isl_give isl_multi_pw_aff *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_pdiv_q(); + inline /* implicit */ ast_expr_op_pdiv_q(const ast_expr_op_pdiv_q &obj); + inline ast_expr_op_pdiv_q &operator=(ast_expr_op_pdiv_q obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::multi_pw_aff add(isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff add_constant(isl::multi_val mv) const; - inline isl::multi_pw_aff add_constant(isl::val v) const; - inline isl::multi_pw_aff add_dims(isl::dim type, unsigned int n) const; - inline isl::multi_pw_aff align_params(isl::space model) const; - inline isl::set bind(isl::multi_id tuple) const; - inline isl::multi_pw_aff bind_domain(isl::multi_id tuple) const; - inline isl::multi_pw_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; - inline isl::multi_pw_aff coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::set domain() const; - inline isl::multi_pw_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::map eq_map(isl::multi_pw_aff mpa2) 
const; - inline isl::multi_pw_aff factor_range() const; - inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::multi_pw_aff flat_range_product(isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff flatten_range() const; - inline isl::multi_pw_aff from_range() const; - inline isl::pw_aff get_at(int pos) const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline isl::space get_domain_space() const; - inline uint32_t get_hash() const; - inline isl::pw_aff_list get_list() const; - inline isl::pw_aff get_pw_aff(int pos) const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; - inline isl::multi_pw_aff gist(isl::set set) const; - inline isl::multi_pw_aff gist_params(isl::set set) const; - inline boolean has_tuple_id(isl::dim type) const; - static inline isl::multi_pw_aff identity(isl::space space); - inline isl::multi_pw_aff identity() const; - static inline isl::multi_pw_aff identity_on_domain(isl::space space); - inline isl::multi_pw_aff insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::multi_pw_aff insert_domain(isl::space domain) const; - inline isl::multi_pw_aff intersect_domain(isl::set domain) const; - inline isl::multi_pw_aff intersect_params(isl::set set) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_nan() const; - inline boolean involves_param(const isl::id &id) const; - inline boolean involves_param(const isl::id_list &list) const; - inline boolean is_cst() const; - inline boolean is_equal(const isl::multi_pw_aff &mpa2) const; - inline isl::map lex_ge_map(isl::multi_pw_aff mpa2) const; - inline isl::map lex_gt_map(isl::multi_pw_aff mpa2) const; - inline isl::map lex_le_map(isl::multi_pw_aff mpa2) const; - inline isl::map 
lex_lt_map(isl::multi_pw_aff mpa2) const; - inline isl::multi_pw_aff max(isl::multi_pw_aff multi2) const; - inline isl::multi_val max_multi_val() const; - inline isl::multi_pw_aff min(isl::multi_pw_aff multi2) const; - inline isl::multi_val min_multi_val() const; - inline isl::multi_pw_aff mod_multi_val(isl::multi_val mv) const; - inline isl::multi_pw_aff move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl::multi_pw_aff neg() const; - inline boolean plain_is_equal(const isl::multi_pw_aff &multi2) const; - inline isl::multi_pw_aff product(isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff project_domain_on_params() const; - inline isl::multi_pw_aff pullback(isl::multi_aff ma) const; - inline isl::multi_pw_aff pullback(isl::multi_pw_aff mpa2) const; - inline isl::multi_pw_aff pullback(isl::pw_multi_aff pma) const; - inline isl::multi_pw_aff range_factor_domain() const; - inline isl::multi_pw_aff range_factor_range() const; - inline boolean range_is_wrapping() const; - inline isl::multi_pw_aff range_product(isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff range_splice(unsigned int pos, isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff reset_tuple_id(isl::dim type) const; - inline isl::multi_pw_aff reset_user() const; - inline isl::multi_pw_aff scale(isl::multi_val mv) const; - inline isl::multi_pw_aff scale(isl::val v) const; - inline isl::multi_pw_aff scale_down(isl::multi_val mv) const; - inline isl::multi_pw_aff scale_down(isl::val v) const; - inline isl::multi_pw_aff set_at(int pos, isl::pw_aff el) const; - inline isl::multi_pw_aff set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::multi_pw_aff set_pw_aff(int pos, isl::pw_aff el) const; - inline isl::multi_pw_aff set_tuple_id(isl::dim type, isl::id id) const; - inline isl::multi_pw_aff set_tuple_name(isl::dim type, const std::string &s) const; - inline isl_size size() const; - inline 
isl::multi_pw_aff splice(unsigned int in_pos, unsigned int out_pos, isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff sub(isl::multi_pw_aff multi2) const; - inline isl::multi_pw_aff unbind_params_insert_domain(isl::multi_id domain) const; - inline isl::multi_pw_aff union_add(isl::multi_pw_aff mpa2) const; - static inline isl::multi_pw_aff zero(isl::space space); }; -// declarations for isl::multi_union_pw_aff -inline multi_union_pw_aff manage(__isl_take isl_multi_union_pw_aff *ptr); -inline multi_union_pw_aff manage_copy(__isl_keep isl_multi_union_pw_aff *ptr); +// declarations for isl::ast_expr_op_pdiv_r -class multi_union_pw_aff { - friend inline multi_union_pw_aff manage(__isl_take isl_multi_union_pw_aff *ptr); - friend inline multi_union_pw_aff manage_copy(__isl_keep isl_multi_union_pw_aff *ptr); +class ast_expr_op_pdiv_r : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_pdiv_r ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_pdiv_r; - isl_multi_union_pw_aff *ptr = nullptr; +protected: + inline explicit ast_expr_op_pdiv_r(__isl_take isl_ast_expr *ptr); - inline explicit multi_union_pw_aff(__isl_take isl_multi_union_pw_aff *ptr); +public: + inline /* implicit */ ast_expr_op_pdiv_r(); + inline /* implicit */ ast_expr_op_pdiv_r(const ast_expr_op_pdiv_r &obj); + inline ast_expr_op_pdiv_r &operator=(ast_expr_op_pdiv_r obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_select + +class ast_expr_op_select : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_select ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_select; + +protected: + inline explicit ast_expr_op_select(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ multi_union_pw_aff(); - inline /* implicit */ multi_union_pw_aff(const multi_union_pw_aff &obj); - inline /* implicit */ multi_union_pw_aff(isl::multi_pw_aff mpa); - 
inline /* implicit */ multi_union_pw_aff(isl::union_pw_aff upa); - inline explicit multi_union_pw_aff(isl::space space, isl::union_pw_aff_list list); - inline explicit multi_union_pw_aff(isl::union_pw_multi_aff upma); - inline explicit multi_union_pw_aff(isl::ctx ctx, const std::string &str); - inline multi_union_pw_aff &operator=(multi_union_pw_aff obj); - inline ~multi_union_pw_aff(); - inline __isl_give isl_multi_union_pw_aff *copy() const &; - inline __isl_give isl_multi_union_pw_aff *copy() && = delete; - inline __isl_keep isl_multi_union_pw_aff *get() const; - inline __isl_give isl_multi_union_pw_aff *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_select(); + inline /* implicit */ ast_expr_op_select(const ast_expr_op_select &obj); + inline ast_expr_op_select &operator=(ast_expr_op_select obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::multi_union_pw_aff add(isl::multi_union_pw_aff multi2) const; - inline isl::multi_union_pw_aff align_params(isl::space model) const; - inline isl::union_pw_aff apply_aff(isl::aff aff) const; - inline isl::union_pw_aff apply_pw_aff(isl::pw_aff pa) const; - inline isl::multi_union_pw_aff apply_pw_multi_aff(isl::pw_multi_aff pma) const; - inline isl::union_set bind(isl::multi_id tuple) const; - inline isl::multi_union_pw_aff coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::union_set domain() const; - inline isl::multi_union_pw_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::multi_pw_aff extract_multi_pw_aff(isl::space space) const; - inline isl::multi_union_pw_aff factor_range() const; - inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::multi_union_pw_aff flat_range_product(isl::multi_union_pw_aff multi2) const; - inline isl::multi_union_pw_aff flatten_range() const; - inline isl::multi_union_pw_aff 
floor() const; - static inline isl::multi_union_pw_aff from_multi_aff(isl::multi_aff ma); - inline isl::multi_union_pw_aff from_range() const; - static inline isl::multi_union_pw_aff from_union_map(isl::union_map umap); - inline isl::union_pw_aff get_at(int pos) const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline isl::space get_domain_space() const; - inline isl::union_pw_aff_list get_list() const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; - inline isl::union_pw_aff get_union_pw_aff(int pos) const; - inline isl::multi_union_pw_aff gist(isl::union_set context) const; - inline isl::multi_union_pw_aff gist_params(isl::set context) const; - inline boolean has_tuple_id(isl::dim type) const; - inline isl::multi_union_pw_aff intersect_domain(isl::union_set uset) const; - inline isl::multi_union_pw_aff intersect_params(isl::set params) const; - inline isl::multi_union_pw_aff intersect_range(isl::set set) const; - inline boolean involves_nan() const; - inline isl::multi_val max_multi_val() const; - inline isl::multi_val min_multi_val() const; - inline isl::multi_union_pw_aff mod_multi_val(isl::multi_val mv) const; - static inline isl::multi_union_pw_aff multi_aff_on_domain(isl::union_set domain, isl::multi_aff ma); - static inline isl::multi_union_pw_aff multi_val_on_domain(isl::union_set domain, isl::multi_val mv); - inline isl::multi_union_pw_aff neg() const; - inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; - inline isl::multi_union_pw_aff pullback(isl::union_pw_multi_aff upma) const; - static inline isl::multi_union_pw_aff pw_multi_aff_on_domain(isl::union_set domain, isl::pw_multi_aff pma); - inline isl::multi_union_pw_aff range_factor_domain() const; - inline isl::multi_union_pw_aff range_factor_range() const; - inline boolean range_is_wrapping() const; - inline isl::multi_union_pw_aff 
range_product(isl::multi_union_pw_aff multi2) const; - inline isl::multi_union_pw_aff range_splice(unsigned int pos, isl::multi_union_pw_aff multi2) const; - inline isl::multi_union_pw_aff reset_tuple_id(isl::dim type) const; - inline isl::multi_union_pw_aff reset_user() const; - inline isl::multi_union_pw_aff scale(isl::multi_val mv) const; - inline isl::multi_union_pw_aff scale(isl::val v) const; - inline isl::multi_union_pw_aff scale_down(isl::multi_val mv) const; - inline isl::multi_union_pw_aff scale_down(isl::val v) const; - inline isl::multi_union_pw_aff set_at(int pos, isl::union_pw_aff el) const; - inline isl::multi_union_pw_aff set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::multi_union_pw_aff set_tuple_id(isl::dim type, isl::id id) const; - inline isl::multi_union_pw_aff set_tuple_name(isl::dim type, const std::string &s) const; - inline isl::multi_union_pw_aff set_union_pw_aff(int pos, isl::union_pw_aff el) const; - inline isl_size size() const; - inline isl::multi_union_pw_aff sub(isl::multi_union_pw_aff multi2) const; - inline isl::multi_union_pw_aff union_add(isl::multi_union_pw_aff mupa2) const; - static inline isl::multi_union_pw_aff zero(isl::space space); - inline isl::union_set zero_union_set() const; }; -// declarations for isl::multi_val -inline multi_val manage(__isl_take isl_multi_val *ptr); -inline multi_val manage_copy(__isl_keep isl_multi_val *ptr); +// declarations for isl::ast_expr_op_sub -class multi_val { - friend inline multi_val manage(__isl_take isl_multi_val *ptr); - friend inline multi_val manage_copy(__isl_keep isl_multi_val *ptr); +class ast_expr_op_sub : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_sub ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_sub; - isl_multi_val *ptr = nullptr; +protected: + inline explicit ast_expr_op_sub(__isl_take isl_ast_expr *ptr); - inline explicit multi_val(__isl_take isl_multi_val *ptr); +public: 
+ inline /* implicit */ ast_expr_op_sub(); + inline /* implicit */ ast_expr_op_sub(const ast_expr_op_sub &obj); + inline ast_expr_op_sub &operator=(ast_expr_op_sub obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::ast_expr_op_zdiv_r + +class ast_expr_op_zdiv_r : public ast_expr_op { + template + friend boolean ast_expr_op::isa() const; + friend ast_expr_op_zdiv_r ast_expr_op::as() const; + static const auto type = isl_ast_expr_op_zdiv_r; + +protected: + inline explicit ast_expr_op_zdiv_r(__isl_take isl_ast_expr *ptr); public: - inline /* implicit */ multi_val(); - inline /* implicit */ multi_val(const multi_val &obj); - inline explicit multi_val(isl::space space, isl::val_list list); - inline explicit multi_val(isl::ctx ctx, const std::string &str); - inline multi_val &operator=(multi_val obj); - inline ~multi_val(); - inline __isl_give isl_multi_val *copy() const &; - inline __isl_give isl_multi_val *copy() && = delete; - inline __isl_keep isl_multi_val *get() const; - inline __isl_give isl_multi_val *release(); - inline bool is_null() const; + inline /* implicit */ ast_expr_op_zdiv_r(); + inline /* implicit */ ast_expr_op_zdiv_r(const ast_expr_op_zdiv_r &obj); + inline ast_expr_op_zdiv_r &operator=(ast_expr_op_zdiv_r obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::multi_val add(isl::multi_val multi2) const; - inline isl::multi_val add(isl::val v) const; - inline isl::multi_val add_dims(isl::dim type, unsigned int n) const; - inline isl::multi_val align_params(isl::space model) const; - inline isl_size dim(isl::dim type) const; - inline isl::multi_val drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::multi_val factor_range() const; - inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::multi_val flat_range_product(isl::multi_val multi2) const; - inline isl::multi_val 
flatten_range() const; - inline isl::multi_val from_range() const; - inline isl::val get_at(int pos) const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline isl::space get_domain_space() const; - inline isl::val_list get_list() const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; - inline isl::val get_val(int pos) const; - inline boolean has_tuple_id(isl::dim type) const; - inline isl::multi_val insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_nan() const; - inline boolean is_zero() const; - inline isl::multi_val max(isl::multi_val multi2) const; - inline isl::multi_val min(isl::multi_val multi2) const; - inline isl::multi_val mod_multi_val(isl::multi_val mv) const; - inline isl::multi_val mod_val(isl::val v) const; - inline isl::multi_val neg() const; - inline boolean plain_is_equal(const isl::multi_val &multi2) const; - inline isl::multi_val product(isl::multi_val multi2) const; - inline isl::multi_val project_domain_on_params() const; - inline isl::multi_val range_factor_domain() const; - inline isl::multi_val range_factor_range() const; - inline boolean range_is_wrapping() const; - inline isl::multi_val range_product(isl::multi_val multi2) const; - inline isl::multi_val range_splice(unsigned int pos, isl::multi_val multi2) const; - inline isl::multi_val reset_tuple_id(isl::dim type) const; - inline isl::multi_val reset_user() const; - inline isl::multi_val scale(isl::multi_val mv) const; - inline isl::multi_val scale(isl::val v) const; - inline isl::multi_val scale_down(isl::multi_val mv) const; - inline isl::multi_val scale_down(isl::val v) const; - inline isl::multi_val set_at(int pos, isl::val el) const; - inline isl::multi_val set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - 
inline isl::multi_val set_tuple_id(isl::dim type, isl::id id) const; - inline isl::multi_val set_tuple_name(isl::dim type, const std::string &s) const; - inline isl::multi_val set_val(int pos, isl::val el) const; - inline isl_size size() const; - inline isl::multi_val splice(unsigned int in_pos, unsigned int out_pos, isl::multi_val multi2) const; - inline isl::multi_val sub(isl::multi_val multi2) const; - static inline isl::multi_val zero(isl::space space); }; -// declarations for isl::point -inline point manage(__isl_take isl_point *ptr); -inline point manage_copy(__isl_keep isl_point *ptr); +// declarations for isl::ast_node +inline ast_node manage(__isl_take isl_ast_node *ptr); +inline ast_node manage_copy(__isl_keep isl_ast_node *ptr); -class point { - friend inline point manage(__isl_take isl_point *ptr); - friend inline point manage_copy(__isl_keep isl_point *ptr); +class ast_node { + friend inline ast_node manage(__isl_take isl_ast_node *ptr); + friend inline ast_node manage_copy(__isl_keep isl_ast_node *ptr); - isl_point *ptr = nullptr; +protected: + isl_ast_node *ptr = nullptr; - inline explicit point(__isl_take isl_point *ptr); + inline explicit ast_node(__isl_take isl_ast_node *ptr); public: - inline /* implicit */ point(); - inline /* implicit */ point(const point &obj); - inline explicit point(isl::space dim); - inline point &operator=(point obj); - inline ~point(); - inline __isl_give isl_point *copy() const &; - inline __isl_give isl_point *copy() && = delete; - inline __isl_keep isl_point *get() const; - inline __isl_give isl_point *release(); + inline /* implicit */ ast_node(); + inline /* implicit */ ast_node(const ast_node &obj); + inline ast_node &operator=(ast_node obj); + inline ~ast_node(); + inline __isl_give isl_ast_node *copy() const &; + inline __isl_give isl_ast_node *copy() && = delete; + inline __isl_keep isl_ast_node *get() const; + inline __isl_give isl_ast_node *release(); inline bool is_null() const; +private: + template 
::value>::type> + inline boolean isa_type(T subtype) const; +public: + template inline boolean isa() const; + template inline T as() const; inline isl::ctx ctx() const; - inline void dump() const; - inline isl::point add_ui(isl::dim type, int pos, unsigned int val) const; - inline isl::val get_coordinate_val(isl::dim type, int pos) const; - inline isl::multi_val get_multi_val() const; - inline isl::space get_space() const; - inline isl::point set_coordinate_val(isl::dim type, int pos, isl::val v) const; - inline isl::point sub_ui(isl::dim type, int pos, unsigned int val) const; + inline isl::id annotation() const; + inline isl::id get_annotation() const; + inline std::string to_C_str() const; + inline isl::ast_node_list to_list() const; }; -// declarations for isl::pw_aff -inline pw_aff manage(__isl_take isl_pw_aff *ptr); -inline pw_aff manage_copy(__isl_keep isl_pw_aff *ptr); +// declarations for isl::ast_node_block -class pw_aff { - friend inline pw_aff manage(__isl_take isl_pw_aff *ptr); - friend inline pw_aff manage_copy(__isl_keep isl_pw_aff *ptr); +class ast_node_block : public ast_node { + template + friend boolean ast_node::isa() const; + friend ast_node_block ast_node::as() const; + static const auto type = isl_ast_node_block; - isl_pw_aff *ptr = nullptr; +protected: + inline explicit ast_node_block(__isl_take isl_ast_node *ptr); - inline explicit pw_aff(__isl_take isl_pw_aff *ptr); +public: + inline /* implicit */ ast_node_block(); + inline /* implicit */ ast_node_block(const ast_node_block &obj); + inline ast_node_block &operator=(ast_node_block obj); + inline isl::ctx ctx() const; + + inline isl::ast_node_list children() const; + inline isl::ast_node_list get_children() const; +}; + +// declarations for isl::ast_node_for + +class ast_node_for : public ast_node { + template + friend boolean ast_node::isa() const; + friend ast_node_for ast_node::as() const; + static const auto type = isl_ast_node_for; + +protected: + inline explicit 
ast_node_for(__isl_take isl_ast_node *ptr); public: - inline /* implicit */ pw_aff(); - inline /* implicit */ pw_aff(const pw_aff &obj); - inline /* implicit */ pw_aff(isl::aff aff); - inline explicit pw_aff(isl::ctx ctx, const std::string &str); - inline explicit pw_aff(isl::set domain, isl::val v); - inline explicit pw_aff(isl::local_space ls); - inline pw_aff &operator=(pw_aff obj); - inline ~pw_aff(); - inline __isl_give isl_pw_aff *copy() const &; - inline __isl_give isl_pw_aff *copy() && = delete; - inline __isl_keep isl_pw_aff *get() const; - inline __isl_give isl_pw_aff *release(); - inline bool is_null() const; + inline /* implicit */ ast_node_for(); + inline /* implicit */ ast_node_for(const ast_node_for &obj); + inline ast_node_for &operator=(ast_node_for obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::pw_aff add(isl::pw_aff pwaff2) const; - inline isl::pw_aff add_constant(isl::val v) const; - inline isl::pw_aff add_dims(isl::dim type, unsigned int n) const; - inline isl::pw_aff align_params(isl::space model) const; - static inline isl::pw_aff alloc(isl::set set, isl::aff aff); - inline isl::aff as_aff() const; - inline isl::set bind(isl::id id) const; - inline isl::pw_aff bind_domain(isl::multi_id tuple) const; - inline isl::pw_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; - inline isl::pw_aff ceil() const; - inline isl::pw_aff coalesce() const; - inline isl::pw_aff cond(isl::pw_aff pwaff_true, isl::pw_aff pwaff_false) const; - inline isl_size dim(isl::dim type) const; - inline isl::pw_aff div(isl::pw_aff pa2) const; - inline isl::set domain() const; - inline isl::pw_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::pw_aff drop_unused_params() const; - static inline isl::pw_aff empty(isl::space space); - inline isl::map eq_map(isl::pw_aff pa2) const; - inline isl::set eq_set(isl::pw_aff pwaff2) const; - inline isl::val eval(isl::point pnt) const; - inline int 
find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::pw_aff floor() const; - inline stat foreach_piece(const std::function &fn) const; - inline isl::pw_aff from_range() const; - inline isl::map ge_map(isl::pw_aff pa2) const; - inline isl::set ge_set(isl::pw_aff pwaff2) const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::space get_domain_space() const; - inline uint32_t get_hash() const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline isl::pw_aff gist(isl::set context) const; - inline isl::pw_aff gist_params(isl::set context) const; - inline isl::map gt_map(isl::pw_aff pa2) const; - inline isl::set gt_set(isl::pw_aff pwaff2) const; - inline boolean has_dim_id(isl::dim type, unsigned int pos) const; - inline boolean has_tuple_id(isl::dim type) const; - inline isl::pw_aff insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::pw_aff insert_domain(isl::space domain) const; - inline isl::pw_aff intersect_domain(isl::set set) const; - inline isl::pw_aff intersect_domain_wrapped_domain(isl::set set) const; - inline isl::pw_aff intersect_domain_wrapped_range(isl::set set) const; - inline isl::pw_aff intersect_params(isl::set set) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_nan() const; - inline boolean involves_param_id(const isl::id &id) const; - inline boolean is_cst() const; - inline boolean is_empty() const; - inline boolean is_equal(const isl::pw_aff &pa2) const; - inline boolean isa_aff() const; - inline isl::map le_map(isl::pw_aff pa2) const; - inline isl::set le_set(isl::pw_aff pwaff2) const; - inline isl::map lt_map(isl::pw_aff pa2) const; - inline isl::set lt_set(isl::pw_aff pwaff2) const; - inline isl::pw_aff max(isl::pw_aff pwaff2) const; - inline isl::pw_aff 
min(isl::pw_aff pwaff2) const; - inline isl::pw_aff mod(isl::val mod) const; - inline isl::pw_aff move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl::pw_aff mul(isl::pw_aff pwaff2) const; - inline isl_size n_piece() const; - static inline isl::pw_aff nan_on_domain(isl::local_space ls); - static inline isl::pw_aff nan_on_domain_space(isl::space space); - inline isl::set ne_set(isl::pw_aff pwaff2) const; - inline isl::pw_aff neg() const; - inline isl::set non_zero_set() const; - inline isl::set nonneg_set() const; - static inline isl::pw_aff param_on_domain(isl::set domain, isl::id id); - inline isl::set params() const; - inline int plain_cmp(const isl::pw_aff &pa2) const; - inline boolean plain_is_equal(const isl::pw_aff &pwaff2) const; - inline isl::set pos_set() const; - inline isl::pw_aff project_domain_on_params() const; - inline isl::pw_aff pullback(isl::multi_aff ma) const; - inline isl::pw_aff pullback(isl::multi_pw_aff mpa) const; - inline isl::pw_aff pullback(isl::pw_multi_aff pma) const; - inline isl::pw_aff reset_tuple_id(isl::dim type) const; - inline isl::pw_aff reset_user() const; - inline isl::pw_aff scale(isl::val v) const; - inline isl::pw_aff scale_down(isl::val f) const; - inline isl::pw_aff set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::pw_aff set_tuple_id(isl::dim type, isl::id id) const; - inline isl::pw_aff sub(isl::pw_aff pwaff2) const; - inline isl::pw_aff subtract_domain(isl::set set) const; - inline isl::pw_aff tdiv_q(isl::pw_aff pa2) const; - inline isl::pw_aff tdiv_r(isl::pw_aff pa2) const; - inline isl::pw_aff union_add(isl::pw_aff pwaff2) const; - inline isl::pw_aff union_max(isl::pw_aff pwaff2) const; - inline isl::pw_aff union_min(isl::pw_aff pwaff2) const; - static inline isl::pw_aff var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos); - inline isl::set zero_set() const; -}; - -// declarations for 
isl::pw_aff_list -inline pw_aff_list manage(__isl_take isl_pw_aff_list *ptr); -inline pw_aff_list manage_copy(__isl_keep isl_pw_aff_list *ptr); - -class pw_aff_list { - friend inline pw_aff_list manage(__isl_take isl_pw_aff_list *ptr); - friend inline pw_aff_list manage_copy(__isl_keep isl_pw_aff_list *ptr); - - isl_pw_aff_list *ptr = nullptr; - - inline explicit pw_aff_list(__isl_take isl_pw_aff_list *ptr); - -public: - inline /* implicit */ pw_aff_list(); - inline /* implicit */ pw_aff_list(const pw_aff_list &obj); - inline pw_aff_list &operator=(pw_aff_list obj); - inline ~pw_aff_list(); - inline __isl_give isl_pw_aff_list *copy() const &; - inline __isl_give isl_pw_aff_list *copy() && = delete; - inline __isl_keep isl_pw_aff_list *get() const; - inline __isl_give isl_pw_aff_list *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::pw_aff_list add(isl::pw_aff el) const; - static inline isl::pw_aff_list alloc(isl::ctx ctx, int n); - inline isl::pw_aff_list clear() const; - inline isl::pw_aff_list concat(isl::pw_aff_list list2) const; - inline isl::pw_aff_list drop(unsigned int first, unsigned int n) const; - inline isl::set eq_set(isl::pw_aff_list list2) const; - inline stat foreach(const std::function &fn) const; - static inline isl::pw_aff_list from_pw_aff(isl::pw_aff el); - inline isl::set ge_set(isl::pw_aff_list list2) const; - inline isl::pw_aff get_at(int index) const; - inline isl::pw_aff get_pw_aff(int index) const; - inline isl::set gt_set(isl::pw_aff_list list2) const; - inline isl::pw_aff_list insert(unsigned int pos, isl::pw_aff el) const; - inline isl::set le_set(isl::pw_aff_list list2) const; - inline isl::set lt_set(isl::pw_aff_list list2) const; - inline isl::pw_aff max() const; - inline isl::pw_aff min() const; - inline isl_size n_pw_aff() const; - inline isl::set ne_set(isl::pw_aff_list list2) const; - inline isl::pw_aff_list reverse() const; - inline isl::pw_aff_list 
set_pw_aff(int index, isl::pw_aff el) const; - inline isl_size size() const; - inline isl::pw_aff_list swap(unsigned int pos1, unsigned int pos2) const; + inline isl::ast_node body() const; + inline isl::ast_node get_body() const; + inline isl::ast_expr cond() const; + inline isl::ast_expr get_cond() const; + inline isl::ast_expr inc() const; + inline isl::ast_expr get_inc() const; + inline isl::ast_expr init() const; + inline isl::ast_expr get_init() const; + inline boolean is_degenerate() const; + inline isl::ast_expr iterator() const; + inline isl::ast_expr get_iterator() const; }; -// declarations for isl::pw_multi_aff -inline pw_multi_aff manage(__isl_take isl_pw_multi_aff *ptr); -inline pw_multi_aff manage_copy(__isl_keep isl_pw_multi_aff *ptr); - -class pw_multi_aff { - friend inline pw_multi_aff manage(__isl_take isl_pw_multi_aff *ptr); - friend inline pw_multi_aff manage_copy(__isl_keep isl_pw_multi_aff *ptr); +// declarations for isl::ast_node_if - isl_pw_multi_aff *ptr = nullptr; +class ast_node_if : public ast_node { + template + friend boolean ast_node::isa() const; + friend ast_node_if ast_node::as() const; + static const auto type = isl_ast_node_if; - inline explicit pw_multi_aff(__isl_take isl_pw_multi_aff *ptr); +protected: + inline explicit ast_node_if(__isl_take isl_ast_node *ptr); public: - inline /* implicit */ pw_multi_aff(); - inline /* implicit */ pw_multi_aff(const pw_multi_aff &obj); - inline /* implicit */ pw_multi_aff(isl::multi_aff ma); - inline /* implicit */ pw_multi_aff(isl::pw_aff pa); - inline explicit pw_multi_aff(isl::ctx ctx, const std::string &str); - inline pw_multi_aff &operator=(pw_multi_aff obj); - inline ~pw_multi_aff(); - inline __isl_give isl_pw_multi_aff *copy() const &; - inline __isl_give isl_pw_multi_aff *copy() && = delete; - inline __isl_keep isl_pw_multi_aff *get() const; - inline __isl_give isl_pw_multi_aff *release(); - inline bool is_null() const; + inline /* implicit */ ast_node_if(); + inline /* implicit */ 
ast_node_if(const ast_node_if &obj); + inline ast_node_if &operator=(ast_node_if obj); inline isl::ctx ctx() const; - inline void dump() const; - inline isl::pw_multi_aff add(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff add_constant(isl::multi_val mv) const; - inline isl::pw_multi_aff add_constant(isl::val v) const; - inline isl::pw_multi_aff align_params(isl::space model) const; - static inline isl::pw_multi_aff alloc(isl::set set, isl::multi_aff maff); - inline isl::multi_aff as_multi_aff() const; - inline isl::pw_multi_aff bind_domain(isl::multi_id tuple) const; - inline isl::pw_multi_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; - inline isl::pw_multi_aff coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::set domain() const; - static inline isl::pw_multi_aff domain_map(isl::space space); - inline isl::pw_multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::pw_multi_aff drop_unused_params() const; - static inline isl::pw_multi_aff empty(isl::space space); - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::pw_multi_aff fix_si(isl::dim type, unsigned int pos, int value) const; - inline isl::pw_multi_aff flat_range_product(isl::pw_multi_aff pma2) const; - inline stat foreach_piece(const std::function &fn) const; - static inline isl::pw_multi_aff from_domain(isl::set set); - static inline isl::pw_multi_aff from_map(isl::map map); - static inline isl::pw_multi_aff from_multi_pw_aff(isl::multi_pw_aff mpa); - static inline isl::pw_multi_aff from_set(isl::set set); - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::space get_domain_space() const; - inline isl::pw_aff get_pw_aff(int pos) const; - inline isl::space get_space() const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; 
- inline isl::pw_multi_aff gist(isl::set set) const; - inline isl::pw_multi_aff gist_params(isl::set set) const; - inline boolean has_tuple_id(isl::dim type) const; - inline boolean has_tuple_name(isl::dim type) const; - static inline isl::pw_multi_aff identity(isl::space space); - static inline isl::pw_multi_aff identity_on_domain(isl::space space); - inline isl::pw_multi_aff insert_domain(isl::space domain) const; - inline isl::pw_multi_aff intersect_domain(isl::set set) const; - inline isl::pw_multi_aff intersect_domain_wrapped_domain(isl::set set) const; - inline isl::pw_multi_aff intersect_domain_wrapped_range(isl::set set) const; - inline isl::pw_multi_aff intersect_params(isl::set set) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_locals() const; - inline boolean involves_nan() const; - inline boolean involves_param_id(const isl::id &id) const; - inline boolean is_equal(const isl::pw_multi_aff &pma2) const; - inline boolean isa_multi_aff() const; - inline isl::multi_val max_multi_val() const; - inline isl::multi_val min_multi_val() const; - static inline isl::pw_multi_aff multi_val_on_domain(isl::set domain, isl::multi_val mv); - inline isl_size n_piece() const; - inline isl::pw_multi_aff neg() const; - inline boolean plain_is_equal(const isl::pw_multi_aff &pma2) const; - inline isl::pw_multi_aff preimage_domain_wrapped_domain(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff product(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff project_domain_on_params() const; - static inline isl::pw_multi_aff project_out_map(isl::space space, isl::dim type, unsigned int first, unsigned int n); - inline isl::pw_multi_aff pullback(isl::multi_aff ma) const; - inline isl::pw_multi_aff pullback(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff range_factor_domain() const; - inline isl::pw_multi_aff range_factor_range() const; - static inline isl::pw_multi_aff 
range_map(isl::space space); - inline isl::pw_multi_aff range_product(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff reset_tuple_id(isl::dim type) const; - inline isl::pw_multi_aff reset_user() const; - inline isl::pw_multi_aff scale(isl::val v) const; - inline isl::pw_multi_aff scale_down(isl::val v) const; - inline isl::pw_multi_aff scale_multi_val(isl::multi_val mv) const; - inline isl::pw_multi_aff set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::pw_multi_aff set_pw_aff(unsigned int pos, isl::pw_aff pa) const; - inline isl::pw_multi_aff set_tuple_id(isl::dim type, isl::id id) const; - inline isl::pw_multi_aff sub(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff subtract_domain(isl::set set) const; - inline isl::pw_multi_aff union_add(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff union_lexmax(isl::pw_multi_aff pma2) const; - inline isl::pw_multi_aff union_lexmin(isl::pw_multi_aff pma2) const; - static inline isl::pw_multi_aff zero(isl::space space); + inline isl::ast_expr cond() const; + inline isl::ast_expr get_cond() const; + inline isl::ast_node else_node() const; + inline isl::ast_node get_else_node() const; + inline boolean has_else_node() const; + inline isl::ast_node then_node() const; + inline isl::ast_node get_then_node() const; }; -// declarations for isl::pw_multi_aff_list -inline pw_multi_aff_list manage(__isl_take isl_pw_multi_aff_list *ptr); -inline pw_multi_aff_list manage_copy(__isl_keep isl_pw_multi_aff_list *ptr); +// declarations for isl::ast_node_list +inline ast_node_list manage(__isl_take isl_ast_node_list *ptr); +inline ast_node_list manage_copy(__isl_keep isl_ast_node_list *ptr); -class pw_multi_aff_list { - friend inline pw_multi_aff_list manage(__isl_take isl_pw_multi_aff_list *ptr); - friend inline pw_multi_aff_list manage_copy(__isl_keep isl_pw_multi_aff_list *ptr); +class ast_node_list { + friend inline ast_node_list manage(__isl_take isl_ast_node_list *ptr); + friend inline 
ast_node_list manage_copy(__isl_keep isl_ast_node_list *ptr); - isl_pw_multi_aff_list *ptr = nullptr; +protected: + isl_ast_node_list *ptr = nullptr; - inline explicit pw_multi_aff_list(__isl_take isl_pw_multi_aff_list *ptr); + inline explicit ast_node_list(__isl_take isl_ast_node_list *ptr); public: - inline /* implicit */ pw_multi_aff_list(); - inline /* implicit */ pw_multi_aff_list(const pw_multi_aff_list &obj); - inline pw_multi_aff_list &operator=(pw_multi_aff_list obj); - inline ~pw_multi_aff_list(); - inline __isl_give isl_pw_multi_aff_list *copy() const &; - inline __isl_give isl_pw_multi_aff_list *copy() && = delete; - inline __isl_keep isl_pw_multi_aff_list *get() const; - inline __isl_give isl_pw_multi_aff_list *release(); + inline /* implicit */ ast_node_list(); + inline /* implicit */ ast_node_list(const ast_node_list &obj); + inline explicit ast_node_list(isl::ctx ctx, int n); + inline explicit ast_node_list(isl::ast_node el); + inline ast_node_list &operator=(ast_node_list obj); + inline ~ast_node_list(); + inline __isl_give isl_ast_node_list *copy() const &; + inline __isl_give isl_ast_node_list *copy() && = delete; + inline __isl_keep isl_ast_node_list *get() const; + inline __isl_give isl_ast_node_list *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::pw_multi_aff_list add(isl::pw_multi_aff el) const; - static inline isl::pw_multi_aff_list alloc(isl::ctx ctx, int n); - inline isl::pw_multi_aff_list clear() const; - inline isl::pw_multi_aff_list concat(isl::pw_multi_aff_list list2) const; - inline isl::pw_multi_aff_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::pw_multi_aff_list from_pw_multi_aff(isl::pw_multi_aff el); - inline isl::pw_multi_aff get_at(int index) const; - inline isl::pw_multi_aff get_pw_multi_aff(int index) const; - inline isl::pw_multi_aff_list insert(unsigned int pos, isl::pw_multi_aff 
el) const; - inline isl_size n_pw_multi_aff() const; - inline isl::pw_multi_aff_list reverse() const; - inline isl::pw_multi_aff_list set_pw_multi_aff(int index, isl::pw_multi_aff el) const; - inline isl_size size() const; - inline isl::pw_multi_aff_list swap(unsigned int pos1, unsigned int pos2) const; -}; - -// declarations for isl::pw_qpolynomial -inline pw_qpolynomial manage(__isl_take isl_pw_qpolynomial *ptr); -inline pw_qpolynomial manage_copy(__isl_keep isl_pw_qpolynomial *ptr); -class pw_qpolynomial { - friend inline pw_qpolynomial manage(__isl_take isl_pw_qpolynomial *ptr); - friend inline pw_qpolynomial manage_copy(__isl_keep isl_pw_qpolynomial *ptr); - - isl_pw_qpolynomial *ptr = nullptr; - - inline explicit pw_qpolynomial(__isl_take isl_pw_qpolynomial *ptr); - -public: - inline /* implicit */ pw_qpolynomial(); - inline /* implicit */ pw_qpolynomial(const pw_qpolynomial &obj); - inline explicit pw_qpolynomial(isl::ctx ctx, const std::string &str); - inline pw_qpolynomial &operator=(pw_qpolynomial obj); - inline ~pw_qpolynomial(); - inline __isl_give isl_pw_qpolynomial *copy() const &; - inline __isl_give isl_pw_qpolynomial *copy() && = delete; - inline __isl_keep isl_pw_qpolynomial *get() const; - inline __isl_give isl_pw_qpolynomial *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::pw_qpolynomial add(isl::pw_qpolynomial pwqp2) const; - inline isl::pw_qpolynomial add_dims(isl::dim type, unsigned int n) const; - static inline isl::pw_qpolynomial alloc(isl::set set, isl::qpolynomial qp); - inline isl::qpolynomial as_qpolynomial() const; - inline isl::pw_qpolynomial coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::set domain() const; - inline isl::pw_qpolynomial drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::pw_qpolynomial drop_unused_params() const; - inline isl::val eval(isl::point pnt) const; - inline int 
find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::pw_qpolynomial fix_val(isl::dim type, unsigned int n, isl::val v) const; - inline stat foreach_piece(const std::function &fn) const; - static inline isl::pw_qpolynomial from_pw_aff(isl::pw_aff pwaff); - static inline isl::pw_qpolynomial from_qpolynomial(isl::qpolynomial qp); - inline isl::pw_qpolynomial from_range() const; - inline isl::space get_domain_space() const; - inline isl::space get_space() const; - inline isl::pw_qpolynomial gist(isl::set context) const; - inline isl::pw_qpolynomial gist_params(isl::set context) const; - inline boolean has_equal_space(const isl::pw_qpolynomial &pwqp2) const; - inline isl::pw_qpolynomial insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::pw_qpolynomial intersect_domain(isl::set set) const; - inline isl::pw_qpolynomial intersect_domain_wrapped_domain(isl::set set) const; - inline isl::pw_qpolynomial intersect_domain_wrapped_range(isl::set set) const; - inline isl::pw_qpolynomial intersect_params(isl::set set) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_nan() const; - inline boolean involves_param_id(const isl::id &id) const; - inline boolean is_zero() const; - inline boolean isa_qpolynomial() const; - inline isl::val max() const; - inline isl::val min() const; - inline isl::pw_qpolynomial move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl::pw_qpolynomial mul(isl::pw_qpolynomial pwqp2) const; - inline isl_size n_piece() const; - inline isl::pw_qpolynomial neg() const; - inline boolean plain_is_equal(const isl::pw_qpolynomial &pwqp2) const; - inline isl::pw_qpolynomial pow(unsigned int exponent) const; - inline isl::pw_qpolynomial project_domain_on_params() const; - inline isl::pw_qpolynomial reset_domain_space(isl::space space) const; - inline isl::pw_qpolynomial 
reset_user() const; - inline isl::pw_qpolynomial scale_down_val(isl::val v) const; - inline isl::pw_qpolynomial scale_val(isl::val v) const; - inline isl::pw_qpolynomial split_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::pw_qpolynomial split_periods(int max_periods) const; - inline isl::pw_qpolynomial sub(isl::pw_qpolynomial pwqp2) const; - inline isl::pw_qpolynomial subtract_domain(isl::set set) const; - inline isl::pw_qpolynomial to_polynomial(int sign) const; - static inline isl::pw_qpolynomial zero(isl::space space); + inline isl::ast_node_list add(isl::ast_node el) const; + inline isl::ast_node at(int index) const; + inline isl::ast_node get_at(int index) const; + inline isl::ast_node_list clear() const; + inline isl::ast_node_list concat(isl::ast_node_list list2) const; + inline isl::ast_node_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::ast_node_list insert(unsigned int pos, isl::ast_node el) const; + inline class size size() const; }; -// declarations for isl::pw_qpolynomial_fold_list -inline pw_qpolynomial_fold_list manage(__isl_take isl_pw_qpolynomial_fold_list *ptr); -inline pw_qpolynomial_fold_list manage_copy(__isl_keep isl_pw_qpolynomial_fold_list *ptr); - -class pw_qpolynomial_fold_list { - friend inline pw_qpolynomial_fold_list manage(__isl_take isl_pw_qpolynomial_fold_list *ptr); - friend inline pw_qpolynomial_fold_list manage_copy(__isl_keep isl_pw_qpolynomial_fold_list *ptr); +// declarations for isl::ast_node_mark - isl_pw_qpolynomial_fold_list *ptr = nullptr; +class ast_node_mark : public ast_node { + template + friend boolean ast_node::isa() const; + friend ast_node_mark ast_node::as() const; + static const auto type = isl_ast_node_mark; - inline explicit pw_qpolynomial_fold_list(__isl_take isl_pw_qpolynomial_fold_list *ptr); +protected: + inline explicit ast_node_mark(__isl_take isl_ast_node *ptr); public: - inline /* implicit */ 
pw_qpolynomial_fold_list(); - inline /* implicit */ pw_qpolynomial_fold_list(const pw_qpolynomial_fold_list &obj); - inline pw_qpolynomial_fold_list &operator=(pw_qpolynomial_fold_list obj); - inline ~pw_qpolynomial_fold_list(); - inline __isl_give isl_pw_qpolynomial_fold_list *copy() const &; - inline __isl_give isl_pw_qpolynomial_fold_list *copy() && = delete; - inline __isl_keep isl_pw_qpolynomial_fold_list *get() const; - inline __isl_give isl_pw_qpolynomial_fold_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_node_mark(); + inline /* implicit */ ast_node_mark(const ast_node_mark &obj); + inline ast_node_mark &operator=(ast_node_mark obj); inline isl::ctx ctx() const; - inline void dump() const; + inline isl::id id() const; + inline isl::id get_id() const; + inline isl::ast_node node() const; + inline isl::ast_node get_node() const; }; -// declarations for isl::pw_qpolynomial_list -inline pw_qpolynomial_list manage(__isl_take isl_pw_qpolynomial_list *ptr); -inline pw_qpolynomial_list manage_copy(__isl_keep isl_pw_qpolynomial_list *ptr); +// declarations for isl::ast_node_user -class pw_qpolynomial_list { - friend inline pw_qpolynomial_list manage(__isl_take isl_pw_qpolynomial_list *ptr); - friend inline pw_qpolynomial_list manage_copy(__isl_keep isl_pw_qpolynomial_list *ptr); +class ast_node_user : public ast_node { + template + friend boolean ast_node::isa() const; + friend ast_node_user ast_node::as() const; + static const auto type = isl_ast_node_user; - isl_pw_qpolynomial_list *ptr = nullptr; - - inline explicit pw_qpolynomial_list(__isl_take isl_pw_qpolynomial_list *ptr); +protected: + inline explicit ast_node_user(__isl_take isl_ast_node *ptr); public: - inline /* implicit */ pw_qpolynomial_list(); - inline /* implicit */ pw_qpolynomial_list(const pw_qpolynomial_list &obj); - inline pw_qpolynomial_list &operator=(pw_qpolynomial_list obj); - inline ~pw_qpolynomial_list(); - inline __isl_give isl_pw_qpolynomial_list *copy() const 
&; - inline __isl_give isl_pw_qpolynomial_list *copy() && = delete; - inline __isl_keep isl_pw_qpolynomial_list *get() const; - inline __isl_give isl_pw_qpolynomial_list *release(); - inline bool is_null() const; + inline /* implicit */ ast_node_user(); + inline /* implicit */ ast_node_user(const ast_node_user &obj); + inline ast_node_user &operator=(ast_node_user obj); inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::pw_qpolynomial_list add(isl::pw_qpolynomial el) const; - static inline isl::pw_qpolynomial_list alloc(isl::ctx ctx, int n); - inline isl::pw_qpolynomial_list clear() const; - inline isl::pw_qpolynomial_list concat(isl::pw_qpolynomial_list list2) const; - inline isl::pw_qpolynomial_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::pw_qpolynomial_list from_pw_qpolynomial(isl::pw_qpolynomial el); - inline isl::pw_qpolynomial get_at(int index) const; - inline isl::pw_qpolynomial get_pw_qpolynomial(int index) const; - inline isl::pw_qpolynomial_list insert(unsigned int pos, isl::pw_qpolynomial el) const; - inline isl_size n_pw_qpolynomial() const; - inline isl::pw_qpolynomial_list reverse() const; - inline isl::pw_qpolynomial_list set_pw_qpolynomial(int index, isl::pw_qpolynomial el) const; - inline isl_size size() const; - inline isl::pw_qpolynomial_list swap(unsigned int pos1, unsigned int pos2) const; -}; - -// declarations for isl::qpolynomial -inline qpolynomial manage(__isl_take isl_qpolynomial *ptr); -inline qpolynomial manage_copy(__isl_keep isl_qpolynomial *ptr); - -class qpolynomial { - friend inline qpolynomial manage(__isl_take isl_qpolynomial *ptr); - friend inline qpolynomial manage_copy(__isl_keep isl_qpolynomial *ptr); - - isl_qpolynomial *ptr = nullptr; - inline explicit qpolynomial(__isl_take isl_qpolynomial *ptr); - -public: - inline /* implicit */ qpolynomial(); - inline /* implicit */ qpolynomial(const qpolynomial &obj); - inline 
qpolynomial &operator=(qpolynomial obj); - inline ~qpolynomial(); - inline __isl_give isl_qpolynomial *copy() const &; - inline __isl_give isl_qpolynomial *copy() && = delete; - inline __isl_keep isl_qpolynomial *get() const; - inline __isl_give isl_qpolynomial *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::qpolynomial add(isl::qpolynomial qp2) const; - inline isl::qpolynomial add_dims(isl::dim type, unsigned int n) const; - inline isl::qpolynomial align_params(isl::space model) const; - inline stat as_polynomial_on_domain(const isl::basic_set &bset, const std::function &fn) const; - inline isl_size dim(isl::dim type) const; - inline isl::qpolynomial drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::val eval(isl::point pnt) const; - inline stat foreach_term(const std::function &fn) const; - static inline isl::qpolynomial from_aff(isl::aff aff); - static inline isl::qpolynomial from_constraint(isl::constraint c, isl::dim type, unsigned int pos); - static inline isl::qpolynomial from_term(isl::term term); - inline isl::val get_constant_val() const; - inline isl::space get_domain_space() const; - inline isl::space get_space() const; - inline isl::qpolynomial gist(isl::set context) const; - inline isl::qpolynomial gist_params(isl::set context) const; - inline isl::qpolynomial homogenize() const; - static inline isl::qpolynomial infty_on_domain(isl::space domain); - inline isl::qpolynomial insert_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean is_infty() const; - inline boolean is_nan() const; - inline boolean is_neginfty() const; - inline boolean is_zero() const; - inline isl::qpolynomial move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl::qpolynomial mul(isl::qpolynomial 
qp2) const; - static inline isl::qpolynomial nan_on_domain(isl::space domain); - inline isl::qpolynomial neg() const; - static inline isl::qpolynomial neginfty_on_domain(isl::space domain); - static inline isl::qpolynomial one_on_domain(isl::space domain); - inline boolean plain_is_equal(const isl::qpolynomial &qp2) const; - inline isl::qpolynomial pow(unsigned int power) const; - inline isl::qpolynomial project_domain_on_params() const; - inline isl::qpolynomial scale_down_val(isl::val v) const; - inline isl::qpolynomial scale_val(isl::val v) const; - inline int sgn() const; - inline isl::qpolynomial sub(isl::qpolynomial qp2) const; - static inline isl::qpolynomial val_on_domain(isl::space space, isl::val val); - static inline isl::qpolynomial var_on_domain(isl::space domain, isl::dim type, unsigned int pos); - static inline isl::qpolynomial zero_on_domain(isl::space domain); + inline isl::ast_expr expr() const; + inline isl::ast_expr get_expr() const; }; -// declarations for isl::qpolynomial_list -inline qpolynomial_list manage(__isl_take isl_qpolynomial_list *ptr); -inline qpolynomial_list manage_copy(__isl_keep isl_qpolynomial_list *ptr); +// declarations for isl::basic_map +inline basic_map manage(__isl_take isl_basic_map *ptr); +inline basic_map manage_copy(__isl_keep isl_basic_map *ptr); -class qpolynomial_list { - friend inline qpolynomial_list manage(__isl_take isl_qpolynomial_list *ptr); - friend inline qpolynomial_list manage_copy(__isl_keep isl_qpolynomial_list *ptr); +class basic_map { + friend inline basic_map manage(__isl_take isl_basic_map *ptr); + friend inline basic_map manage_copy(__isl_keep isl_basic_map *ptr); - isl_qpolynomial_list *ptr = nullptr; +protected: + isl_basic_map *ptr = nullptr; - inline explicit qpolynomial_list(__isl_take isl_qpolynomial_list *ptr); + inline explicit basic_map(__isl_take isl_basic_map *ptr); public: - inline /* implicit */ qpolynomial_list(); - inline /* implicit */ qpolynomial_list(const qpolynomial_list &obj); 
- inline qpolynomial_list &operator=(qpolynomial_list obj); - inline ~qpolynomial_list(); - inline __isl_give isl_qpolynomial_list *copy() const &; - inline __isl_give isl_qpolynomial_list *copy() && = delete; - inline __isl_keep isl_qpolynomial_list *get() const; - inline __isl_give isl_qpolynomial_list *release(); + inline /* implicit */ basic_map(); + inline /* implicit */ basic_map(const basic_map &obj); + inline explicit basic_map(isl::ctx ctx, const std::string &str); + inline basic_map &operator=(basic_map obj); + inline ~basic_map(); + inline __isl_give isl_basic_map *copy() const &; + inline __isl_give isl_basic_map *copy() && = delete; + inline __isl_keep isl_basic_map *get() const; + inline __isl_give isl_basic_map *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::qpolynomial_list add(isl::qpolynomial el) const; - static inline isl::qpolynomial_list alloc(isl::ctx ctx, int n); - inline isl::qpolynomial_list clear() const; - inline isl::qpolynomial_list concat(isl::qpolynomial_list list2) const; - inline isl::qpolynomial_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::qpolynomial_list from_qpolynomial(isl::qpolynomial el); - inline isl::qpolynomial get_at(int index) const; - inline isl::qpolynomial get_qpolynomial(int index) const; - inline isl::qpolynomial_list insert(unsigned int pos, isl::qpolynomial el) const; - inline isl_size n_qpolynomial() const; - inline isl::qpolynomial_list reverse() const; - inline isl::qpolynomial_list set_qpolynomial(int index, isl::qpolynomial el) const; - inline isl_size size() const; - inline isl::qpolynomial_list swap(unsigned int pos1, unsigned int pos2) const; -}; - -// declarations for isl::schedule -inline schedule manage(__isl_take isl_schedule *ptr); -inline schedule manage_copy(__isl_keep isl_schedule *ptr); -class schedule { - friend inline schedule manage(__isl_take 
isl_schedule *ptr); - friend inline schedule manage_copy(__isl_keep isl_schedule *ptr); - - isl_schedule *ptr = nullptr; - - inline explicit schedule(__isl_take isl_schedule *ptr); - -public: - inline /* implicit */ schedule(); - inline /* implicit */ schedule(const schedule &obj); - inline explicit schedule(isl::ctx ctx, const std::string &str); - inline schedule &operator=(schedule obj); - inline ~schedule(); - inline __isl_give isl_schedule *copy() const &; - inline __isl_give isl_schedule *copy() && = delete; - inline __isl_keep isl_schedule *get() const; - inline __isl_give isl_schedule *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::schedule align_params(isl::space space) const; - static inline isl::schedule empty(isl::space space); - static inline isl::schedule from_domain(isl::union_set domain); - inline isl::union_set get_domain() const; - inline isl::union_map get_map() const; - inline isl::schedule_node get_root() const; - inline isl::schedule gist_domain_params(isl::set context) const; - inline isl::schedule insert_context(isl::set context) const; - inline isl::schedule insert_guard(isl::set guard) const; - inline isl::schedule insert_partial_schedule(isl::multi_union_pw_aff partial) const; - inline isl::schedule intersect_domain(isl::union_set domain) const; - inline boolean plain_is_equal(const isl::schedule &schedule2) const; - inline isl::schedule pullback(isl::union_pw_multi_aff upma) const; - inline isl::schedule reset_user() const; - inline isl::schedule sequence(isl::schedule schedule2) const; + inline isl::map add_constraint(const isl::constraint &constraint) const; + inline isl::map add_dims(isl::dim type, unsigned int n) const; + inline isl::basic_map affine_hull() const; + inline isl::map align_params(const isl::space &model) const; + inline isl::basic_map apply_domain(isl::basic_map bmap2) const; + inline isl::map apply_domain(const isl::map &map2) const; + inline 
isl::union_map apply_domain(const isl::union_map &umap2) const; + inline isl::basic_map apply_range(isl::basic_map bmap2) const; + inline isl::map apply_range(const isl::map &map2) const; + inline isl::union_map apply_range(const isl::union_map &umap2) const; + inline isl::map as_map() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::union_pw_multi_aff as_union_pw_multi_aff() const; + inline isl::basic_map_list basic_map_list() const; + inline isl::set bind_domain(const isl::multi_id &tuple) const; + inline isl::set bind_range(const isl::multi_id &tuple) const; + inline boolean can_curry() const; + inline isl::map coalesce() const; + inline isl::map complement() const; + inline isl::union_map compute_divs() const; + inline isl::map curry() const; + inline isl::basic_set deltas() const; + inline isl::basic_map detect_equalities() const; + inline class size dim(isl::dim type) const; + inline isl::pw_aff dim_max(int pos) const; + inline isl::pw_aff dim_min(int pos) const; + inline isl::basic_set domain() const; + inline isl::map domain_factor_domain() const; + inline isl::map domain_factor_range() const; + inline isl::map domain_map() const; + inline isl::union_pw_multi_aff domain_map_union_pw_multi_aff() const; + inline isl::map domain_product(const isl::map &map2) const; + inline isl::union_map domain_product(const isl::union_map &umap2) const; + inline class size domain_tuple_dim() const; + inline isl::id domain_tuple_id() const; + inline isl::map eq_at(const isl::multi_pw_aff &mpa) const; + inline isl::union_map eq_at(const isl::multi_union_pw_aff &mupa) const; + static inline isl::basic_map equal(isl::space space, unsigned int n_equal); + inline isl::basic_map equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const; + inline boolean every_map(const std::function &test) const; + inline isl::map extract_map(const isl::space &space) const; + inline isl::map 
factor_domain() const; + inline isl::map factor_range() const; + inline isl::basic_map fix_si(isl::dim type, unsigned int pos, int value) const; + inline isl::basic_map fix_val(isl::dim type, unsigned int pos, isl::val v) const; + inline isl::basic_map fix_val(isl::dim type, unsigned int pos, long v) const; + inline isl::union_map fixed_power(const isl::val &exp) const; + inline isl::union_map fixed_power(long exp) const; + inline isl::map flat_range_product(const isl::map &map2) const; + inline isl::union_map flat_range_product(const isl::union_map &umap2) const; + inline isl::basic_map flatten() const; + inline isl::basic_map flatten_domain() const; + inline isl::basic_map flatten_range() const; + inline isl::map floordiv_val(const isl::val &d) const; + inline isl::map floordiv_val(long d) const; + inline stat foreach_basic_map(const std::function &fn) const; + inline stat foreach_map(const std::function &fn) const; + static inline isl::basic_map from_aff(isl::aff aff); + static inline isl::basic_map from_domain_and_range(isl::basic_set domain, isl::basic_set range); + inline isl::basic_map gist(isl::basic_map context) const; + inline isl::map gist(const isl::map &context) const; + inline isl::union_map gist(const isl::union_map &context) const; + inline isl::map gist_domain(const isl::set &context) const; + inline isl::union_map gist_domain(const isl::union_set &uset) const; + inline isl::map gist_params(const isl::set &context) const; + inline isl::union_map gist_range(const isl::union_set &uset) const; + inline boolean has_domain_tuple_id() const; + inline boolean has_equal_space(const isl::map &map2) const; + inline boolean has_range_tuple_id() const; + inline boolean has_tuple_id(isl::dim type) const; + inline boolean has_tuple_name(isl::dim type) const; + inline isl::basic_map intersect(isl::basic_map bmap2) const; + inline isl::map intersect(const isl::map &map2) const; + inline isl::union_map intersect(const isl::union_map &umap2) const; + inline 
isl::basic_map intersect_domain(isl::basic_set bset) const; + inline isl::map intersect_domain(const isl::set &set) const; + inline isl::union_map intersect_domain(const isl::space &space) const; + inline isl::union_map intersect_domain(const isl::union_set &uset) const; + inline isl::basic_map intersect_domain(const isl::point &bset) const; + inline isl::map intersect_domain_factor_domain(const isl::map &factor) const; + inline isl::union_map intersect_domain_factor_domain(const isl::union_map &factor) const; + inline isl::map intersect_domain_factor_range(const isl::map &factor) const; + inline isl::union_map intersect_domain_factor_range(const isl::union_map &factor) const; + inline isl::map intersect_params(const isl::set ¶ms) const; + inline isl::basic_map intersect_range(isl::basic_set bset) const; + inline isl::map intersect_range(const isl::set &set) const; + inline isl::union_map intersect_range(const isl::space &space) const; + inline isl::union_map intersect_range(const isl::union_set &uset) const; + inline isl::basic_map intersect_range(const isl::point &bset) const; + inline isl::map intersect_range_factor_domain(const isl::map &factor) const; + inline isl::union_map intersect_range_factor_domain(const isl::union_map &factor) const; + inline isl::map intersect_range_factor_range(const isl::map &factor) const; + inline isl::union_map intersect_range_factor_range(const isl::union_map &factor) const; + inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline boolean is_bijective() const; + inline boolean is_disjoint(const isl::map &map2) const; + inline boolean is_disjoint(const isl::union_map &umap2) const; + inline boolean is_empty() const; + inline boolean is_equal(const isl::basic_map &bmap2) const; + inline boolean is_equal(const isl::map &map2) const; + inline boolean is_equal(const isl::union_map &umap2) const; + inline boolean is_injective() const; + inline boolean is_single_valued() const; + inline boolean 
is_strict_subset(const isl::map &map2) const; + inline boolean is_strict_subset(const isl::union_map &umap2) const; + inline boolean is_subset(const isl::basic_map &bmap2) const; + inline boolean is_subset(const isl::map &map2) const; + inline boolean is_subset(const isl::union_map &umap2) const; + inline boolean isa_map() const; + inline isl::map lex_ge_at(const isl::multi_pw_aff &mpa) const; + inline isl::map lex_gt_at(const isl::multi_pw_aff &mpa) const; + inline isl::map lex_le_at(const isl::multi_pw_aff &mpa) const; + inline isl::map lex_lt_at(const isl::multi_pw_aff &mpa) const; + inline isl::map lexmax() const; + inline isl::pw_multi_aff lexmax_pw_multi_aff() const; + inline isl::map lexmin() const; + inline isl::pw_multi_aff lexmin_pw_multi_aff() const; + inline isl::map lower_bound(const isl::multi_pw_aff &lower) const; + inline isl::map lower_bound_si(isl::dim type, unsigned int pos, int value) const; + inline isl::map_list map_list() const; + inline isl::multi_pw_aff max_multi_pw_aff() const; + inline isl::multi_pw_aff min_multi_pw_aff() const; + inline isl::map move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; + inline class size n_basic_map() const; + inline isl::map order_lt(isl::dim type1, int pos1, isl::dim type2, int pos2) const; + inline isl::set params() const; + inline isl::val plain_get_val_if_fixed(isl::dim type, unsigned int pos) const; + inline isl::basic_map polyhedral_hull() const; + inline isl::map preimage_domain(const isl::multi_aff &ma) const; + inline isl::map preimage_domain(const isl::multi_pw_aff &mpa) const; + inline isl::map preimage_domain(const isl::pw_multi_aff &pma) const; + inline isl::union_map preimage_domain(const isl::union_pw_multi_aff &upma) const; + inline isl::map preimage_range(const isl::multi_aff &ma) const; + inline isl::map preimage_range(const isl::pw_multi_aff &pma) const; + inline isl::union_map preimage_range(const isl::union_pw_multi_aff 
&upma) const; + inline isl::map product(const isl::map &map2) const; + inline isl::union_map product(const isl::union_map &umap2) const; + inline isl::map project_out(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::map project_out_all_params() const; + inline isl::set range() const; + inline isl::map range_factor_domain() const; + inline isl::map range_factor_range() const; + inline isl::fixed_box range_lattice_tile() const; + inline isl::map range_map() const; + inline isl::map range_product(const isl::map &map2) const; + inline isl::union_map range_product(const isl::union_map &umap2) const; + inline isl::map range_reverse() const; + inline isl::fixed_box range_simple_fixed_box_hull() const; + inline class size range_tuple_dim() const; + inline isl::id range_tuple_id() const; + inline isl::basic_map reverse() const; + inline isl::basic_map sample() const; + inline isl::map set_domain_tuple(const isl::id &id) const; + inline isl::map set_domain_tuple(const std::string &id) const; + inline isl::map set_range_tuple(const isl::id &id) const; + inline isl::map set_range_tuple(const std::string &id) const; + inline isl::map set_tuple_id(isl::dim type, const isl::id &id) const; + inline isl::map set_tuple_id(isl::dim type, const std::string &id) const; + inline isl::space space() const; + inline isl::map subtract(const isl::map &map2) const; + inline isl::union_map subtract(const isl::union_map &umap2) const; + inline isl::union_map subtract_domain(const isl::union_set &dom) const; + inline isl::union_map subtract_range(const isl::union_set &dom) const; + inline isl::map sum(const isl::map &map2) const; + inline isl::basic_map_list to_list() const; + inline isl::union_map to_union_map() const; + inline isl::id tuple_id(isl::dim type) const; + inline isl::map uncurry() const; + inline isl::map unite(isl::basic_map bmap2) const; + inline isl::map unite(const isl::map &map2) const; + inline isl::union_map unite(const isl::union_map &umap2) const; + 
static inline isl::basic_map universe(isl::space space); + inline isl::basic_map unshifted_simple_hull() const; + inline isl::map upper_bound(const isl::multi_pw_aff &upper) const; + inline isl::map upper_bound_si(isl::dim type, unsigned int pos, int value) const; + inline isl::set wrap() const; + inline isl::map zip() const; }; -// declarations for isl::schedule_constraints -inline schedule_constraints manage(__isl_take isl_schedule_constraints *ptr); -inline schedule_constraints manage_copy(__isl_keep isl_schedule_constraints *ptr); +// declarations for isl::basic_map_list +inline basic_map_list manage(__isl_take isl_basic_map_list *ptr); +inline basic_map_list manage_copy(__isl_keep isl_basic_map_list *ptr); -class schedule_constraints { - friend inline schedule_constraints manage(__isl_take isl_schedule_constraints *ptr); - friend inline schedule_constraints manage_copy(__isl_keep isl_schedule_constraints *ptr); +class basic_map_list { + friend inline basic_map_list manage(__isl_take isl_basic_map_list *ptr); + friend inline basic_map_list manage_copy(__isl_keep isl_basic_map_list *ptr); - isl_schedule_constraints *ptr = nullptr; +protected: + isl_basic_map_list *ptr = nullptr; - inline explicit schedule_constraints(__isl_take isl_schedule_constraints *ptr); + inline explicit basic_map_list(__isl_take isl_basic_map_list *ptr); public: - inline /* implicit */ schedule_constraints(); - inline /* implicit */ schedule_constraints(const schedule_constraints &obj); - inline explicit schedule_constraints(isl::ctx ctx, const std::string &str); - inline schedule_constraints &operator=(schedule_constraints obj); - inline ~schedule_constraints(); - inline __isl_give isl_schedule_constraints *copy() const &; - inline __isl_give isl_schedule_constraints *copy() && = delete; - inline __isl_keep isl_schedule_constraints *get() const; - inline __isl_give isl_schedule_constraints *release(); + inline /* implicit */ basic_map_list(); + inline /* implicit */ basic_map_list(const 
basic_map_list &obj); + inline explicit basic_map_list(isl::ctx ctx, int n); + inline explicit basic_map_list(isl::basic_map el); + inline basic_map_list &operator=(basic_map_list obj); + inline ~basic_map_list(); + inline __isl_give isl_basic_map_list *copy() const &; + inline __isl_give isl_basic_map_list *copy() && = delete; + inline __isl_keep isl_basic_map_list *get() const; + inline __isl_give isl_basic_map_list *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; - inline isl::schedule_constraints apply(isl::union_map umap) const; - inline isl::schedule compute_schedule() const; - inline isl::union_map get_coincidence() const; - inline isl::union_map get_conditional_validity() const; - inline isl::union_map get_conditional_validity_condition() const; - inline isl::set get_context() const; - inline isl::union_set get_domain() const; - inline isl::union_map get_proximity() const; - inline isl::union_map get_validity() const; - static inline isl::schedule_constraints on_domain(isl::union_set domain); - inline isl::schedule_constraints set_coincidence(isl::union_map coincidence) const; - inline isl::schedule_constraints set_conditional_validity(isl::union_map condition, isl::union_map validity) const; - inline isl::schedule_constraints set_context(isl::set context) const; - inline isl::schedule_constraints set_proximity(isl::union_map proximity) const; - inline isl::schedule_constraints set_validity(isl::union_map validity) const; + inline isl::basic_map_list add(isl::basic_map el) const; + inline isl::basic_map at(int index) const; + inline isl::basic_map get_at(int index) const; + inline isl::basic_map_list clear() const; + inline isl::basic_map_list concat(isl::basic_map_list list2) const; + inline isl::basic_map_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::basic_map_list insert(unsigned int pos, isl::basic_map el) const; + inline class 
size size() const; }; -// declarations for isl::schedule_node -inline schedule_node manage(__isl_take isl_schedule_node *ptr); -inline schedule_node manage_copy(__isl_keep isl_schedule_node *ptr); +// declarations for isl::basic_set +inline basic_set manage(__isl_take isl_basic_set *ptr); +inline basic_set manage_copy(__isl_keep isl_basic_set *ptr); -class schedule_node { - friend inline schedule_node manage(__isl_take isl_schedule_node *ptr); - friend inline schedule_node manage_copy(__isl_keep isl_schedule_node *ptr); +class basic_set { + friend inline basic_set manage(__isl_take isl_basic_set *ptr); + friend inline basic_set manage_copy(__isl_keep isl_basic_set *ptr); - isl_schedule_node *ptr = nullptr; +protected: + isl_basic_set *ptr = nullptr; - inline explicit schedule_node(__isl_take isl_schedule_node *ptr); + inline explicit basic_set(__isl_take isl_basic_set *ptr); public: - inline /* implicit */ schedule_node(); - inline /* implicit */ schedule_node(const schedule_node &obj); - inline schedule_node &operator=(schedule_node obj); - inline ~schedule_node(); - inline __isl_give isl_schedule_node *copy() const &; - inline __isl_give isl_schedule_node *copy() && = delete; - inline __isl_keep isl_schedule_node *get() const; - inline __isl_give isl_schedule_node *release(); + inline /* implicit */ basic_set(); + inline /* implicit */ basic_set(const basic_set &obj); + inline /* implicit */ basic_set(isl::point pnt); + inline explicit basic_set(isl::ctx ctx, const std::string &str); + inline basic_set &operator=(basic_set obj); + inline ~basic_set(); + inline __isl_give isl_basic_set *copy() const &; + inline __isl_give isl_basic_set *copy() && = delete; + inline __isl_keep isl_basic_set *get() const; + inline __isl_give isl_basic_set *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::schedule_node align_params(isl::space space) const; - inline isl::schedule_node ancestor(int generation) const; - 
inline boolean band_member_get_coincident(int pos) const; - inline isl::schedule_node band_member_set_coincident(int pos, int coincident) const; - inline isl::schedule_node band_set_ast_build_options(isl::union_set options) const; - inline isl::schedule_node child(int pos) const; - inline isl::set context_get_context() const; - inline isl::schedule_node cut() const; - inline isl::union_set domain_get_domain() const; - inline isl::union_pw_multi_aff expansion_get_contraction() const; - inline isl::union_map expansion_get_expansion() const; - inline isl::union_map extension_get_extension() const; - inline isl::union_set filter_get_filter() const; - inline isl::schedule_node first_child() const; - inline stat foreach_ancestor_top_down(const std::function &fn) const; - static inline isl::schedule_node from_domain(isl::union_set domain); - static inline isl::schedule_node from_extension(isl::union_map extension); - inline isl_size get_ancestor_child_position(const isl::schedule_node &ancestor) const; - inline isl::schedule_node get_child(int pos) const; - inline isl_size get_child_position() const; - inline isl::union_set get_domain() const; - inline isl::multi_union_pw_aff get_prefix_schedule_multi_union_pw_aff() const; - inline isl::union_map get_prefix_schedule_relation() const; - inline isl::union_map get_prefix_schedule_union_map() const; - inline isl::union_pw_multi_aff get_prefix_schedule_union_pw_multi_aff() const; - inline isl::schedule get_schedule() const; - inline isl_size get_schedule_depth() const; - inline isl::schedule_node get_shared_ancestor(const isl::schedule_node &node2) const; - inline isl::union_pw_multi_aff get_subtree_contraction() const; - inline isl::union_map get_subtree_expansion() const; - inline isl::union_map get_subtree_schedule_union_map() const; - inline isl_size get_tree_depth() const; - inline isl::union_set get_universe_domain() const; - inline isl::schedule_node graft_after(isl::schedule_node graft) const; - inline 
isl::schedule_node graft_before(isl::schedule_node graft) const; - inline isl::schedule_node group(isl::id group_id) const; - inline isl::set guard_get_guard() const; - inline boolean has_children() const; - inline boolean has_next_sibling() const; - inline boolean has_parent() const; - inline boolean has_previous_sibling() const; - inline isl::schedule_node insert_context(isl::set context) const; - inline isl::schedule_node insert_filter(isl::union_set filter) const; - inline isl::schedule_node insert_guard(isl::set context) const; - inline isl::schedule_node insert_mark(isl::id mark) const; - inline isl::schedule_node insert_partial_schedule(isl::multi_union_pw_aff schedule) const; - inline isl::schedule_node insert_sequence(isl::union_set_list filters) const; - inline isl::schedule_node insert_set(isl::union_set_list filters) const; - inline boolean is_equal(const isl::schedule_node &node2) const; - inline boolean is_subtree_anchored() const; - inline isl::id mark_get_id() const; - inline isl_size n_children() const; - inline isl::schedule_node next_sibling() const; - inline isl::schedule_node order_after(isl::union_set filter) const; - inline isl::schedule_node order_before(isl::union_set filter) const; - inline isl::schedule_node parent() const; - inline isl::schedule_node previous_sibling() const; - inline isl::schedule_node reset_user() const; - inline isl::schedule_node root() const; - inline isl::schedule_node sequence_splice_child(int pos) const; -}; - -// declarations for isl::set -inline set manage(__isl_take isl_set *ptr); -inline set manage_copy(__isl_keep isl_set *ptr); - -class set { - friend inline set manage(__isl_take isl_set *ptr); - friend inline set manage_copy(__isl_keep isl_set *ptr); - - isl_set *ptr = nullptr; - inline explicit set(__isl_take isl_set *ptr); - -public: - inline /* implicit */ set(); - inline /* implicit */ set(const set &obj); - inline /* implicit */ set(isl::basic_set bset); - inline /* implicit */ set(isl::point pnt); - 
inline explicit set(isl::union_set uset); - inline explicit set(isl::ctx ctx, const std::string &str); - inline set &operator=(set obj); - inline ~set(); - inline __isl_give isl_set *copy() const &; - inline __isl_give isl_set *copy() && = delete; - inline __isl_keep isl_set *get() const; - inline __isl_give isl_set *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::set add_constraint(isl::constraint constraint) const; + inline isl::set add_constraint(const isl::constraint &constraint) const; inline isl::set add_dims(isl::dim type, unsigned int n) const; inline isl::basic_set affine_hull() const; - inline isl::set align_params(isl::space model) const; - inline isl::set apply(isl::map map) const; - inline isl::set bind(isl::multi_id tuple) const; - inline isl::basic_set bounded_simple_hull() const; - static inline isl::set box_from_points(isl::point pnt1, isl::point pnt2); + inline isl::set align_params(const isl::space &model) const; + inline isl::basic_set apply(isl::basic_map bmap) const; + inline isl::set apply(const isl::map &map) const; + inline isl::union_set apply(const isl::union_map &umap) const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::basic_set_list basic_set_list() const; + inline isl::set bind(const isl::multi_id &tuple) const; inline isl::set coalesce() const; - inline isl::basic_set coefficients() const; inline isl::set complement() const; + inline isl::union_set compute_divs() const; + inline boolean contains(const isl::space &space) const; inline isl::basic_set convex_hull() const; - inline isl::val count_val() const; - inline isl::set detect_equalities() const; - inline isl_size dim(isl::dim type) const; + inline isl::basic_set detect_equalities() const; + inline class size dim(isl::dim type) const; inline boolean dim_has_any_lower_bound(isl::dim type, unsigned int pos) const; - inline boolean 
dim_has_any_upper_bound(isl::dim type, unsigned int pos) const; - inline boolean dim_has_lower_bound(isl::dim type, unsigned int pos) const; - inline boolean dim_has_upper_bound(isl::dim type, unsigned int pos) const; - inline boolean dim_is_bounded(isl::dim type, unsigned int pos) const; + inline isl::id dim_id(isl::dim type, unsigned int pos) const; inline isl::pw_aff dim_max(int pos) const; inline isl::val dim_max_val(int pos) const; inline isl::pw_aff dim_min(int pos) const; inline isl::val dim_min_val(int pos) const; + inline std::string dim_name(isl::dim type, unsigned int pos) const; + inline isl::aff div(int pos) const; + inline isl::aff get_div(int pos) const; inline isl::set drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::set drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::set drop_unused_params() const; inline isl::set eliminate(isl::dim type, unsigned int first, unsigned int n) const; - static inline isl::set empty(isl::space space); - inline isl::set equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const; + inline boolean every_set(const std::function &test) const; + inline isl::set extract_set(const isl::space &space) const; inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::set fix_si(isl::dim type, unsigned int pos, int value) const; - inline isl::set fix_val(isl::dim type, unsigned int pos, isl::val v) const; - inline isl::set flat_product(isl::set set2) const; - inline isl::set flatten() const; - inline isl::map flatten_map() const; - inline int follows_at(const isl::set &set2, int pos) const; - inline stat foreach_basic_set(const std::function &fn) const; - inline stat foreach_point(const std::function &fn) const; - static inline isl::set from_multi_aff(isl::multi_aff ma); - static inline isl::set 
from_multi_pw_aff(isl::multi_pw_aff mpa); - inline isl::set from_params() const; - static inline isl::set from_pw_aff(isl::pw_aff pwaff); - static inline isl::set from_pw_multi_aff(isl::pw_multi_aff pma); - inline isl::basic_set_list get_basic_set_list() const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::multi_val get_plain_multi_val_if_fixed() const; - inline isl::fixed_box get_simple_fixed_box_hull() const; - inline isl::space get_space() const; - inline isl::val get_stride(int pos) const; - inline isl::id get_tuple_id() const; - inline std::string get_tuple_name() const; - inline isl::set gist(isl::set context) const; - inline isl::set gist_basic_set(isl::basic_set context) const; - inline isl::set gist_params(isl::set context) const; - inline boolean has_dim_id(isl::dim type, unsigned int pos) const; - inline boolean has_dim_name(isl::dim type, unsigned int pos) const; + inline int find_dim_by_id(isl::dim type, const std::string &id) const; + inline isl::basic_set fix_si(isl::dim type, unsigned int pos, int value) const; + inline isl::basic_set fix_val(isl::dim type, unsigned int pos, isl::val v) const; + inline isl::basic_set fix_val(isl::dim type, unsigned int pos, long v) const; + inline isl::basic_set flatten() const; + inline stat foreach_basic_set(const std::function &fn) const; + inline stat foreach_point(const std::function &fn) const; + inline stat foreach_set(const std::function &fn) const; + inline isl::basic_set gist(isl::basic_set context) const; + inline isl::set gist(const isl::set &context) const; + inline isl::union_set gist(const isl::union_set &context) const; + inline isl::basic_set gist(const isl::point &context) const; + inline isl::set gist_params(const isl::set &context) const; inline boolean has_equal_space(const isl::set &set2) const; - inline boolean has_tuple_id() const; - inline boolean has_tuple_name() const; inline 
isl::map identity() const; + inline isl::union_pw_multi_aff identity_union_pw_multi_aff() const; inline isl::pw_aff indicator_function() const; inline isl::set insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; - inline isl::map insert_domain(isl::space domain) const; - inline isl::set intersect(isl::set set2) const; - inline isl::set intersect_factor_domain(isl::set domain) const; - inline isl::set intersect_factor_range(isl::set range) const; - inline isl::set intersect_params(isl::set params) const; + inline isl::map insert_domain(const isl::space &domain) const; + inline isl::basic_set intersect(isl::basic_set bset2) const; + inline isl::set intersect(const isl::set &set2) const; + inline isl::union_set intersect(const isl::union_set &uset2) const; + inline isl::basic_set intersect(const isl::point &bset2) const; + inline isl::basic_set intersect_params(isl::basic_set bset2) const; + inline isl::set intersect_params(const isl::set ¶ms) const; + inline isl::basic_set intersect_params(const isl::point &bset2) const; inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; inline boolean involves_locals() const; inline boolean is_bounded() const; - inline boolean is_box() const; inline boolean is_disjoint(const isl::set &set2) const; + inline boolean is_disjoint(const isl::union_set &uset2) const; inline boolean is_empty() const; + inline boolean is_equal(const isl::basic_set &bset2) const; inline boolean is_equal(const isl::set &set2) const; + inline boolean is_equal(const isl::union_set &uset2) const; + inline boolean is_equal(const isl::point &bset2) const; inline boolean is_params() const; inline boolean is_singleton() const; inline boolean is_strict_subset(const isl::set &set2) const; + inline boolean is_strict_subset(const isl::union_set &uset2) const; + inline boolean is_subset(const isl::basic_set &bset2) const; inline boolean is_subset(const isl::set &set2) const; + inline boolean is_subset(const 
isl::union_set &uset2) const; + inline boolean is_subset(const isl::point &bset2) const; inline boolean is_wrapping() const; - inline isl::map lex_ge_set(isl::set set2) const; - inline isl::map lex_gt_set(isl::set set2) const; - inline isl::map lex_lt_set(isl::set set2) const; + inline boolean isa_set() const; inline isl::set lexmax() const; inline isl::pw_multi_aff lexmax_pw_multi_aff() const; inline isl::set lexmin() const; inline isl::pw_multi_aff lexmin_pw_multi_aff() const; - inline isl::set lower_bound(isl::multi_pw_aff lower) const; - inline isl::set lower_bound(isl::multi_val lower) const; + inline isl::set lower_bound(const isl::multi_pw_aff &lower) const; + inline isl::set lower_bound(const isl::multi_val &lower) const; inline isl::set lower_bound_si(isl::dim type, unsigned int pos, int value) const; - inline isl::set lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const; + inline isl::set lower_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const; + inline isl::set lower_bound_val(isl::dim type, unsigned int pos, long value) const; inline isl::multi_pw_aff max_multi_pw_aff() const; inline isl::val max_val(const isl::aff &obj) const; inline isl::multi_pw_aff min_multi_pw_aff() const; inline isl::val min_val(const isl::aff &obj) const; - inline isl::set move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl_size n_basic_set() const; - inline isl_size n_dim() const; - static inline isl::set nat_universe(isl::space space); - inline isl::set neg() const; - inline isl::set params() const; - inline int plain_cmp(const isl::set &set2) const; + inline class size n_basic_set() const; + inline isl::basic_set params() const; inline isl::val plain_get_val_if_fixed(isl::dim type, unsigned int pos) const; - inline boolean plain_is_disjoint(const isl::set &set2) const; - inline boolean plain_is_empty() const; - inline boolean plain_is_equal(const isl::set &set2) 
const; - inline boolean plain_is_universe() const; - inline isl::basic_set plain_unshifted_simple_hull() const; + inline isl::multi_val plain_multi_val_if_fixed() const; inline isl::basic_set polyhedral_hull() const; - inline isl::set preimage(isl::multi_aff ma) const; - inline isl::set preimage(isl::multi_pw_aff mpa) const; - inline isl::set preimage(isl::pw_multi_aff pma) const; - inline isl::set product(isl::set set2) const; - inline isl::map project_onto_map(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::set project_out(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set preimage(const isl::multi_aff &ma) const; + inline isl::set preimage(const isl::multi_pw_aff &mpa) const; + inline isl::set preimage(const isl::pw_multi_aff &pma) const; + inline isl::union_set preimage(const isl::union_pw_multi_aff &upma) const; + inline isl::set product(const isl::set &set2) const; + inline isl::basic_set project_out(isl::dim type, unsigned int first, unsigned int n) const; inline isl::set project_out_all_params() const; - inline isl::set project_out_param(isl::id id) const; - inline isl::set project_out_param(isl::id_list list) const; + inline isl::set project_out_param(const isl::id &id) const; + inline isl::set project_out_param(const std::string &id) const; + inline isl::set project_out_param(const isl::id_list &list) const; + inline isl::pw_multi_aff pw_multi_aff_on_domain(const isl::multi_val &mv) const; inline isl::set remove_dims(isl::dim type, unsigned int first, unsigned int n) const; inline isl::set remove_divs() const; - inline isl::set remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; inline isl::set remove_redundancies() const; - inline isl::set remove_unknown_divs() const; - inline isl::set reset_space(isl::space space) const; inline isl::set reset_tuple_id() const; - inline isl::set reset_user() const; inline isl::basic_set sample() const; inline isl::point sample_point() 
const; - inline isl::set set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::set set_tuple_id(isl::id id) const; - inline isl::set set_tuple_name(const std::string &s) const; + inline isl::set set_dim_id(isl::dim type, unsigned int pos, const isl::id &id) const; + inline isl::set set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const; + inline isl::set_list set_list() const; + inline isl::set set_tuple_id(const isl::id &id) const; + inline isl::set set_tuple_id(const std::string &id) const; + inline isl::fixed_box simple_fixed_box_hull() const; inline isl::basic_set simple_hull() const; - inline int size() const; - inline isl::basic_set solutions() const; - inline isl::set split_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::set subtract(isl::set set2) const; - inline isl::set sum(isl::set set2) const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::val stride(int pos) const; + inline isl::set subtract(const isl::set &set2) const; + inline isl::union_set subtract(const isl::union_set &uset2) const; + inline isl::basic_set_list to_list() const; + inline isl::set to_set() const; + inline isl::union_set to_union_set() const; inline isl::map translation() const; - inline isl_size tuple_dim() const; - inline isl::set unbind_params(isl::multi_id tuple) const; - inline isl::map unbind_params_insert_domain(isl::multi_id domain) const; - inline isl::set unite(isl::set set2) const; - static inline isl::set universe(isl::space space); + inline class size tuple_dim() const; + inline isl::id tuple_id() const; + inline std::string tuple_name() const; + inline isl::set unbind_params(const isl::multi_id &tuple) const; + inline isl::map unbind_params_insert_domain(const isl::multi_id &domain) const; + inline isl::set unite(isl::basic_set bset2) const; + inline isl::set unite(const isl::set &set2) const; + inline isl::union_set unite(const isl::union_set &uset2) const; + 
inline isl::set unite(const isl::point &bset2) const; + static inline isl::basic_set universe(isl::space space); inline isl::basic_set unshifted_simple_hull() const; - inline isl::basic_set unshifted_simple_hull_from_set_list(isl::set_list list) const; inline isl::map unwrap() const; - inline isl::set upper_bound(isl::multi_pw_aff upper) const; - inline isl::set upper_bound(isl::multi_val upper) const; - inline isl::set upper_bound_si(isl::dim type, unsigned int pos, int value) const; - inline isl::set upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const; - inline isl::map wrapped_domain_map() const; + inline isl::set upper_bound(const isl::multi_pw_aff &upper) const; + inline isl::set upper_bound(const isl::multi_val &upper) const; + inline isl::set upper_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const; + inline isl::set upper_bound_val(isl::dim type, unsigned int pos, long value) const; }; -// declarations for isl::set_list -inline set_list manage(__isl_take isl_set_list *ptr); -inline set_list manage_copy(__isl_keep isl_set_list *ptr); +// declarations for isl::basic_set_list +inline basic_set_list manage(__isl_take isl_basic_set_list *ptr); +inline basic_set_list manage_copy(__isl_keep isl_basic_set_list *ptr); -class set_list { - friend inline set_list manage(__isl_take isl_set_list *ptr); - friend inline set_list manage_copy(__isl_keep isl_set_list *ptr); +class basic_set_list { + friend inline basic_set_list manage(__isl_take isl_basic_set_list *ptr); + friend inline basic_set_list manage_copy(__isl_keep isl_basic_set_list *ptr); - isl_set_list *ptr = nullptr; +protected: + isl_basic_set_list *ptr = nullptr; - inline explicit set_list(__isl_take isl_set_list *ptr); + inline explicit basic_set_list(__isl_take isl_basic_set_list *ptr); public: - inline /* implicit */ set_list(); - inline /* implicit */ set_list(const set_list &obj); - inline set_list &operator=(set_list obj); - inline ~set_list(); - inline __isl_give 
isl_set_list *copy() const &; - inline __isl_give isl_set_list *copy() && = delete; - inline __isl_keep isl_set_list *get() const; - inline __isl_give isl_set_list *release(); + inline /* implicit */ basic_set_list(); + inline /* implicit */ basic_set_list(const basic_set_list &obj); + inline explicit basic_set_list(isl::ctx ctx, int n); + inline explicit basic_set_list(isl::basic_set el); + inline basic_set_list &operator=(basic_set_list obj); + inline ~basic_set_list(); + inline __isl_give isl_basic_set_list *copy() const &; + inline __isl_give isl_basic_set_list *copy() && = delete; + inline __isl_keep isl_basic_set_list *get() const; + inline __isl_give isl_basic_set_list *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; - inline isl::set_list add(isl::set el) const; - static inline isl::set_list alloc(isl::ctx ctx, int n); - inline isl::set_list clear() const; - inline isl::set_list concat(isl::set_list list2) const; - inline isl::set_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::set_list from_set(isl::set el); - inline isl::set get_at(int index) const; - inline isl::set get_set(int index) const; - inline isl::set_list insert(unsigned int pos, isl::set el) const; - inline isl_size n_set() const; - inline isl::set_list reverse() const; - inline isl::set_list set_set(int index, isl::set el) const; - inline isl_size size() const; - inline isl::set_list swap(unsigned int pos1, unsigned int pos2) const; - inline isl::set unite() const; + inline isl::basic_set_list add(isl::basic_set el) const; + inline isl::basic_set at(int index) const; + inline isl::basic_set get_at(int index) const; + inline isl::basic_set_list clear() const; + inline isl::basic_set_list concat(isl::basic_set_list list2) const; + inline isl::basic_set_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline 
isl::basic_set_list insert(unsigned int pos, isl::basic_set el) const; + inline class size size() const; }; -// declarations for isl::space -inline space manage(__isl_take isl_space *ptr); -inline space manage_copy(__isl_keep isl_space *ptr); +// declarations for isl::constraint +inline constraint manage(__isl_take isl_constraint *ptr); +inline constraint manage_copy(__isl_keep isl_constraint *ptr); -class space { - friend inline space manage(__isl_take isl_space *ptr); - friend inline space manage_copy(__isl_keep isl_space *ptr); +class constraint { + friend inline constraint manage(__isl_take isl_constraint *ptr); + friend inline constraint manage_copy(__isl_keep isl_constraint *ptr); - isl_space *ptr = nullptr; +protected: + isl_constraint *ptr = nullptr; - inline explicit space(__isl_take isl_space *ptr); + inline explicit constraint(__isl_take isl_constraint *ptr); public: - inline /* implicit */ space(); - inline /* implicit */ space(const space &obj); - inline explicit space(isl::ctx ctx, unsigned int nparam, unsigned int n_in, unsigned int n_out); - inline explicit space(isl::ctx ctx, unsigned int nparam, unsigned int dim); - inline space &operator=(space obj); - inline ~space(); - inline __isl_give isl_space *copy() const &; - inline __isl_give isl_space *copy() && = delete; - inline __isl_keep isl_space *get() const; - inline __isl_give isl_space *release(); + inline /* implicit */ constraint(); + inline /* implicit */ constraint(const constraint &obj); + inline constraint &operator=(constraint obj); + inline ~constraint(); + inline __isl_give isl_constraint *copy() const &; + inline __isl_give isl_constraint *copy() && = delete; + inline __isl_keep isl_constraint *get() const; + inline __isl_give isl_constraint *release(); inline bool is_null() const; inline isl::ctx ctx() const; - inline void dump() const; - inline isl::space add_dims(isl::dim type, unsigned int n) const; - inline isl::space add_named_tuple(isl::id tuple_id, unsigned int dim) const; - 
inline isl::space add_param_id(isl::id id) const; - inline isl::space add_unnamed_tuple(unsigned int dim) const; - inline isl::space align_params(isl::space space2) const; - inline boolean can_curry() const; - inline boolean can_range_curry() const; - inline boolean can_uncurry() const; - inline boolean can_zip() const; - inline isl::space curry() const; - inline isl_size dim(isl::dim type) const; - inline isl::space domain() const; - inline isl::space domain_factor_domain() const; - inline isl::space domain_factor_range() const; - inline boolean domain_is_wrapping() const; - inline isl::space domain_map() const; - inline isl::space domain_product(isl::space right) const; - inline isl::space drop_all_params() const; - inline isl::space drop_dims(isl::dim type, unsigned int first, unsigned int num) const; - inline isl::space factor_domain() const; - inline isl::space factor_range() const; - inline int find_dim_by_id(isl::dim type, const isl::id &id) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::space flatten_domain() const; - inline isl::space flatten_range() const; - inline isl::space from_domain() const; - inline isl::space from_range() const; - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline std::string get_dim_name(isl::dim type, unsigned int pos) const; - inline isl::id get_tuple_id(isl::dim type) const; - inline std::string get_tuple_name(isl::dim type) const; - inline boolean has_dim_id(isl::dim type, unsigned int pos) const; - inline boolean has_dim_name(isl::dim type, unsigned int pos) const; - inline boolean has_equal_params(const isl::space &space2) const; - inline boolean has_equal_tuples(const isl::space &space2) const; - inline boolean has_tuple_id(isl::dim type) const; - inline boolean has_tuple_name(isl::dim type) const; - inline isl::space insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; - inline boolean is_domain(const isl::space &space2) const; - 
inline boolean is_equal(const isl::space &space2) const; - inline boolean is_map() const; - inline boolean is_params() const; - inline boolean is_product() const; - inline boolean is_range(const isl::space &space2) const; - inline boolean is_set() const; - inline boolean is_wrapping() const; - inline isl::space join(isl::space right) const; - inline isl::space map_from_domain_and_range(isl::space range) const; - inline isl::space map_from_set() const; - inline isl::space move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; - inline isl::space params() const; - static inline isl::space params_alloc(isl::ctx ctx, unsigned int nparam); - inline isl::space product(isl::space right) const; - inline isl::space range() const; - inline isl::space range_curry() const; - inline isl::space range_factor_domain() const; - inline isl::space range_factor_range() const; - inline boolean range_is_wrapping() const; - inline isl::space range_map() const; - inline isl::space range_product(isl::space right) const; - inline isl::space range_reverse() const; - inline isl::space reset_tuple_id(isl::dim type) const; - inline isl::space reset_user() const; - inline isl::space reverse() const; - inline isl::space set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; - inline isl::space set_from_params() const; - inline isl::space set_tuple_id(isl::dim type, isl::id id) const; - inline isl::space set_tuple_name(isl::dim type, const std::string &s) const; - inline boolean tuple_is_equal(isl::dim type1, const isl::space &space2, isl::dim type2) const; - inline isl::space uncurry() const; - static inline isl::space unit(isl::ctx ctx); - inline isl::space unwrap() const; - inline isl::space wrap() const; - inline isl::space zip() const; + static inline isl::constraint alloc_equality(isl::local_space ls); + static inline isl::constraint alloc_inequality(isl::local_space ls); + inline isl::constraint set_coefficient_si(isl::dim 
type, int pos, int v) const; + inline isl::constraint set_constant_si(int v) const; + inline isl::constraint set_constant_val(isl::val v) const; + inline isl::constraint set_constant_val(long v) const; +}; + +// declarations for isl::fixed_box +inline fixed_box manage(__isl_take isl_fixed_box *ptr); +inline fixed_box manage_copy(__isl_keep isl_fixed_box *ptr); + +class fixed_box { + friend inline fixed_box manage(__isl_take isl_fixed_box *ptr); + friend inline fixed_box manage_copy(__isl_keep isl_fixed_box *ptr); + +protected: + isl_fixed_box *ptr = nullptr; + + inline explicit fixed_box(__isl_take isl_fixed_box *ptr); + +public: + inline /* implicit */ fixed_box(); + inline /* implicit */ fixed_box(const fixed_box &obj); + inline fixed_box &operator=(fixed_box obj); + inline ~fixed_box(); + inline __isl_give isl_fixed_box *copy() const &; + inline __isl_give isl_fixed_box *copy() && = delete; + inline __isl_keep isl_fixed_box *get() const; + inline __isl_give isl_fixed_box *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline boolean is_valid() const; + inline isl::multi_aff offset() const; + inline isl::multi_aff get_offset() const; + inline isl::multi_val size() const; + inline isl::multi_val get_size() const; + inline isl::space space() const; + inline isl::space get_space() const; }; -// declarations for isl::term -inline term manage(__isl_take isl_term *ptr); -inline term manage_copy(__isl_keep isl_term *ptr); +// declarations for isl::id +inline id manage(__isl_take isl_id *ptr); +inline id manage_copy(__isl_keep isl_id *ptr); + +class id { + friend inline id manage(__isl_take isl_id *ptr); + friend inline id manage_copy(__isl_keep isl_id *ptr); + +protected: + isl_id *ptr = nullptr; + + inline explicit id(__isl_take isl_id *ptr); + +public: + inline /* implicit */ id(); + inline /* implicit */ id(const id &obj); + inline explicit id(isl::ctx ctx, const std::string &str); + inline id &operator=(id obj); + inline ~id(); + inline 
__isl_give isl_id *copy() const &; + inline __isl_give isl_id *copy() && = delete; + inline __isl_keep isl_id *get() const; + inline __isl_give isl_id *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + static inline isl::id alloc(isl::ctx ctx, const std::string &name, void * user); + inline std::string name() const; + inline std::string get_name() const; + inline isl::id_list to_list() const; + inline void * user() const; + inline void * get_user() const; +}; + +// declarations for isl::id_list +inline id_list manage(__isl_take isl_id_list *ptr); +inline id_list manage_copy(__isl_keep isl_id_list *ptr); + +class id_list { + friend inline id_list manage(__isl_take isl_id_list *ptr); + friend inline id_list manage_copy(__isl_keep isl_id_list *ptr); + +protected: + isl_id_list *ptr = nullptr; + + inline explicit id_list(__isl_take isl_id_list *ptr); + +public: + inline /* implicit */ id_list(); + inline /* implicit */ id_list(const id_list &obj); + inline explicit id_list(isl::ctx ctx, int n); + inline explicit id_list(isl::id el); + inline explicit id_list(isl::ctx ctx, const std::string &str); + inline id_list &operator=(id_list obj); + inline ~id_list(); + inline __isl_give isl_id_list *copy() const &; + inline __isl_give isl_id_list *copy() && = delete; + inline __isl_keep isl_id_list *get() const; + inline __isl_give isl_id_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::id_list add(isl::id el) const; + inline isl::id_list add(const std::string &el) const; + inline isl::id at(int index) const; + inline isl::id get_at(int index) const; + inline isl::id_list clear() const; + inline isl::id_list concat(isl::id_list list2) const; + inline isl::id_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::id_list insert(unsigned int pos, isl::id el) const; + inline isl::id_list insert(unsigned int pos, const std::string &el) const; + 
inline class size size() const; +}; + +// declarations for isl::id_to_ast_expr +inline id_to_ast_expr manage(__isl_take isl_id_to_ast_expr *ptr); +inline id_to_ast_expr manage_copy(__isl_keep isl_id_to_ast_expr *ptr); + +class id_to_ast_expr { + friend inline id_to_ast_expr manage(__isl_take isl_id_to_ast_expr *ptr); + friend inline id_to_ast_expr manage_copy(__isl_keep isl_id_to_ast_expr *ptr); + +protected: + isl_id_to_ast_expr *ptr = nullptr; + + inline explicit id_to_ast_expr(__isl_take isl_id_to_ast_expr *ptr); + +public: + inline /* implicit */ id_to_ast_expr(); + inline /* implicit */ id_to_ast_expr(const id_to_ast_expr &obj); + inline id_to_ast_expr &operator=(id_to_ast_expr obj); + inline ~id_to_ast_expr(); + inline __isl_give isl_id_to_ast_expr *copy() const &; + inline __isl_give isl_id_to_ast_expr *copy() && = delete; + inline __isl_keep isl_id_to_ast_expr *get() const; + inline __isl_give isl_id_to_ast_expr *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + static inline isl::id_to_ast_expr alloc(isl::ctx ctx, int min_size); + inline isl::id_to_ast_expr set(isl::id key, isl::ast_expr val) const; + inline isl::id_to_ast_expr set(const std::string &key, const isl::ast_expr &val) const; +}; + +// declarations for isl::local_space +inline local_space manage(__isl_take isl_local_space *ptr); +inline local_space manage_copy(__isl_keep isl_local_space *ptr); + +class local_space { + friend inline local_space manage(__isl_take isl_local_space *ptr); + friend inline local_space manage_copy(__isl_keep isl_local_space *ptr); + +protected: + isl_local_space *ptr = nullptr; + + inline explicit local_space(__isl_take isl_local_space *ptr); + +public: + inline /* implicit */ local_space(); + inline /* implicit */ local_space(const local_space &obj); + inline explicit local_space(isl::space space); + inline local_space &operator=(local_space obj); + inline ~local_space(); + inline __isl_give isl_local_space *copy() const &; + inline 
__isl_give isl_local_space *copy() && = delete; + inline __isl_keep isl_local_space *get() const; + inline __isl_give isl_local_space *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + +}; + +// declarations for isl::map +inline map manage(__isl_take isl_map *ptr); +inline map manage_copy(__isl_keep isl_map *ptr); + +class map { + friend inline map manage(__isl_take isl_map *ptr); + friend inline map manage_copy(__isl_keep isl_map *ptr); + +protected: + isl_map *ptr = nullptr; + + inline explicit map(__isl_take isl_map *ptr); + +public: + inline /* implicit */ map(); + inline /* implicit */ map(const map &obj); + inline /* implicit */ map(isl::basic_map bmap); + inline explicit map(isl::ctx ctx, const std::string &str); + inline map &operator=(map obj); + inline ~map(); + inline __isl_give isl_map *copy() const &; + inline __isl_give isl_map *copy() && = delete; + inline __isl_keep isl_map *get() const; + inline __isl_give isl_map *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::map add_constraint(isl::constraint constraint) const; + inline isl::map add_dims(isl::dim type, unsigned int n) const; + inline isl::basic_map affine_hull() const; + inline isl::map align_params(isl::space model) const; + inline isl::map apply_domain(isl::map map2) const; + inline isl::union_map apply_domain(const isl::union_map &umap2) const; + inline isl::map apply_domain(const isl::basic_map &map2) const; + inline isl::map apply_range(isl::map map2) const; + inline isl::union_map apply_range(const isl::union_map &umap2) const; + inline isl::map apply_range(const isl::basic_map &map2) const; + inline isl::map as_map() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::union_pw_multi_aff as_union_pw_multi_aff() const; + inline isl::basic_map_list basic_map_list() const; + inline isl::basic_map_list get_basic_map_list() const; + inline 
isl::set bind_domain(isl::multi_id tuple) const; + inline isl::set bind_range(isl::multi_id tuple) const; + inline boolean can_curry() const; + inline isl::map coalesce() const; + inline isl::map complement() const; + inline isl::union_map compute_divs() const; + inline isl::map curry() const; + inline isl::set deltas() const; + inline isl::map detect_equalities() const; + inline class size dim(isl::dim type) const; + inline isl::pw_aff dim_max(int pos) const; + inline isl::pw_aff dim_min(int pos) const; + inline isl::set domain() const; + inline isl::map domain_factor_domain() const; + inline isl::map domain_factor_range() const; + inline isl::map domain_map() const; + inline isl::union_pw_multi_aff domain_map_union_pw_multi_aff() const; + inline isl::map domain_product(isl::map map2) const; + inline isl::union_map domain_product(const isl::union_map &umap2) const; + inline isl::map domain_product(const isl::basic_map &map2) const; + inline class size domain_tuple_dim() const; + inline isl::id domain_tuple_id() const; + inline isl::id get_domain_tuple_id() const; + static inline isl::map empty(isl::space space); + inline isl::map eq_at(isl::multi_pw_aff mpa) const; + inline isl::union_map eq_at(const isl::multi_union_pw_aff &mupa) const; + inline isl::map eq_at(const isl::aff &mpa) const; + inline isl::map eq_at(const isl::multi_aff &mpa) const; + inline isl::map eq_at(const isl::pw_aff &mpa) const; + inline isl::map eq_at(const isl::pw_multi_aff &mpa) const; + inline isl::map equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const; + inline boolean every_map(const std::function &test) const; + inline isl::map extract_map(const isl::space &space) const; + inline isl::map factor_domain() const; + inline isl::map factor_range() const; + inline isl::map fix_si(isl::dim type, unsigned int pos, int value) const; + inline isl::union_map fixed_power(const isl::val &exp) const; + inline isl::union_map fixed_power(long exp) const; + inline isl::map 
flat_range_product(isl::map map2) const; + inline isl::union_map flat_range_product(const isl::union_map &umap2) const; + inline isl::map flat_range_product(const isl::basic_map &map2) const; + inline isl::map flatten() const; + inline isl::map flatten_domain() const; + inline isl::map flatten_range() const; + inline isl::map floordiv_val(isl::val d) const; + inline isl::map floordiv_val(long d) const; + inline stat foreach_basic_map(const std::function &fn) const; + inline stat foreach_map(const std::function &fn) const; + static inline isl::map from_aff(isl::aff aff); + static inline isl::map from_domain(isl::set set); + static inline isl::map from_domain_and_range(isl::set domain, isl::set range); + static inline isl::map from_multi_aff(isl::multi_aff maff); + static inline isl::map from_pw_aff(isl::pw_aff pwaff); + static inline isl::map from_range(isl::set set); + static inline isl::map from_union_map(isl::union_map umap); + inline isl::map gist(isl::map context) const; + inline isl::union_map gist(const isl::union_map &context) const; + inline isl::map gist(const isl::basic_map &context) const; + inline isl::map gist_domain(isl::set context) const; + inline isl::union_map gist_domain(const isl::union_set &uset) const; + inline isl::map gist_domain(const isl::basic_set &context) const; + inline isl::map gist_domain(const isl::point &context) const; + inline isl::map gist_params(isl::set context) const; + inline isl::union_map gist_range(const isl::union_set &uset) const; + inline boolean has_domain_tuple_id() const; + inline boolean has_equal_space(const isl::map &map2) const; + inline boolean has_range_tuple_id() const; + inline boolean has_tuple_id(isl::dim type) const; + inline boolean has_tuple_name(isl::dim type) const; + static inline isl::map identity(isl::space space); + inline isl::map intersect(isl::map map2) const; + inline isl::union_map intersect(const isl::union_map &umap2) const; + inline isl::map intersect(const isl::basic_map &map2) const; + 
inline isl::map intersect_domain(isl::set set) const; + inline isl::union_map intersect_domain(const isl::space &space) const; + inline isl::union_map intersect_domain(const isl::union_set &uset) const; + inline isl::map intersect_domain(const isl::basic_set &set) const; + inline isl::map intersect_domain(const isl::point &set) const; + inline isl::map intersect_domain_factor_domain(isl::map factor) const; + inline isl::union_map intersect_domain_factor_domain(const isl::union_map &factor) const; + inline isl::map intersect_domain_factor_domain(const isl::basic_map &factor) const; + inline isl::map intersect_domain_factor_range(isl::map factor) const; + inline isl::union_map intersect_domain_factor_range(const isl::union_map &factor) const; + inline isl::map intersect_domain_factor_range(const isl::basic_map &factor) const; + inline isl::map intersect_params(isl::set params) const; + inline isl::map intersect_range(isl::set set) const; + inline isl::union_map intersect_range(const isl::space &space) const; + inline isl::union_map intersect_range(const isl::union_set &uset) const; + inline isl::map intersect_range(const isl::basic_set &set) const; + inline isl::map intersect_range(const isl::point &set) const; + inline isl::map intersect_range_factor_domain(isl::map factor) const; + inline isl::union_map intersect_range_factor_domain(const isl::union_map &factor) const; + inline isl::map intersect_range_factor_domain(const isl::basic_map &factor) const; + inline isl::map intersect_range_factor_range(isl::map factor) const; + inline isl::union_map intersect_range_factor_range(const isl::union_map &factor) const; + inline isl::map intersect_range_factor_range(const isl::basic_map &factor) const; + inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline boolean is_bijective() const; + inline boolean is_disjoint(const isl::map &map2) const; + inline boolean is_disjoint(const isl::union_map &umap2) const; + inline boolean 
is_disjoint(const isl::basic_map &map2) const; + inline boolean is_empty() const; + inline boolean is_equal(const isl::map &map2) const; + inline boolean is_equal(const isl::union_map &umap2) const; + inline boolean is_equal(const isl::basic_map &map2) const; + inline boolean is_injective() const; + inline boolean is_single_valued() const; + inline boolean is_strict_subset(const isl::map &map2) const; + inline boolean is_strict_subset(const isl::union_map &umap2) const; + inline boolean is_strict_subset(const isl::basic_map &map2) const; + inline boolean is_subset(const isl::map &map2) const; + inline boolean is_subset(const isl::union_map &umap2) const; + inline boolean is_subset(const isl::basic_map &map2) const; + inline boolean isa_map() const; + static inline isl::map lex_ge(isl::space set_space); + inline isl::map lex_ge_at(isl::multi_pw_aff mpa) const; + static inline isl::map lex_gt(isl::space set_space); + inline isl::map lex_gt_at(isl::multi_pw_aff mpa) const; + static inline isl::map lex_le(isl::space set_space); + inline isl::map lex_le_at(isl::multi_pw_aff mpa) const; + static inline isl::map lex_lt(isl::space set_space); + inline isl::map lex_lt_at(isl::multi_pw_aff mpa) const; + inline isl::map lexmax() const; + inline isl::pw_multi_aff lexmax_pw_multi_aff() const; + inline isl::map lexmin() const; + inline isl::pw_multi_aff lexmin_pw_multi_aff() const; + inline isl::map lower_bound(isl::multi_pw_aff lower) const; + inline isl::map lower_bound_si(isl::dim type, unsigned int pos, int value) const; + inline isl::map_list map_list() const; + inline isl::multi_pw_aff max_multi_pw_aff() const; + inline isl::multi_pw_aff min_multi_pw_aff() const; + inline isl::map move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const; + inline class size n_basic_map() const; + inline isl::map order_lt(isl::dim type1, int pos1, isl::dim type2, int pos2) const; + inline isl::set params() const; + inline 
isl::basic_map polyhedral_hull() const; + inline isl::map preimage_domain(isl::multi_aff ma) const; + inline isl::map preimage_domain(isl::multi_pw_aff mpa) const; + inline isl::map preimage_domain(isl::pw_multi_aff pma) const; + inline isl::union_map preimage_domain(const isl::union_pw_multi_aff &upma) const; + inline isl::map preimage_range(isl::multi_aff ma) const; + inline isl::map preimage_range(isl::pw_multi_aff pma) const; + inline isl::union_map preimage_range(const isl::union_pw_multi_aff &upma) const; + inline isl::map product(isl::map map2) const; + inline isl::union_map product(const isl::union_map &umap2) const; + inline isl::map product(const isl::basic_map &map2) const; + inline isl::map project_out(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::map project_out_all_params() const; + inline isl::set range() const; + inline isl::map range_factor_domain() const; + inline isl::map range_factor_range() const; + inline isl::fixed_box range_lattice_tile() const; + inline isl::fixed_box get_range_lattice_tile() const; + inline isl::map range_map() const; + inline isl::map range_product(isl::map map2) const; + inline isl::union_map range_product(const isl::union_map &umap2) const; + inline isl::map range_product(const isl::basic_map &map2) const; + inline isl::map range_reverse() const; + inline isl::fixed_box range_simple_fixed_box_hull() const; + inline isl::fixed_box get_range_simple_fixed_box_hull() const; + inline class size range_tuple_dim() const; + inline isl::id range_tuple_id() const; + inline isl::id get_range_tuple_id() const; + inline isl::map reverse() const; + inline isl::basic_map sample() const; + inline isl::map set_domain_tuple(isl::id id) const; + inline isl::map set_domain_tuple(const std::string &id) const; + inline isl::map set_range_tuple(isl::id id) const; + inline isl::map set_range_tuple(const std::string &id) const; + inline isl::map set_tuple_id(isl::dim type, isl::id id) const; + inline isl::map 
set_tuple_id(isl::dim type, const std::string &id) const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::map subtract(isl::map map2) const; + inline isl::union_map subtract(const isl::union_map &umap2) const; + inline isl::map subtract(const isl::basic_map &map2) const; + inline isl::union_map subtract_domain(const isl::union_set &dom) const; + inline isl::union_map subtract_range(const isl::union_set &dom) const; + inline isl::map sum(isl::map map2) const; + inline isl::map_list to_list() const; + inline isl::union_map to_union_map() const; + inline isl::id tuple_id(isl::dim type) const; + inline isl::id get_tuple_id(isl::dim type) const; + inline isl::map uncurry() const; + inline isl::map unite(isl::map map2) const; + inline isl::union_map unite(const isl::union_map &umap2) const; + inline isl::map unite(const isl::basic_map &map2) const; + static inline isl::map universe(isl::space space); + inline isl::basic_map unshifted_simple_hull() const; + inline isl::map upper_bound(isl::multi_pw_aff upper) const; + inline isl::map upper_bound_si(isl::dim type, unsigned int pos, int value) const; + inline isl::set wrap() const; + inline isl::map zip() const; +}; + +// declarations for isl::map_list +inline map_list manage(__isl_take isl_map_list *ptr); +inline map_list manage_copy(__isl_keep isl_map_list *ptr); + +class map_list { + friend inline map_list manage(__isl_take isl_map_list *ptr); + friend inline map_list manage_copy(__isl_keep isl_map_list *ptr); + +protected: + isl_map_list *ptr = nullptr; + + inline explicit map_list(__isl_take isl_map_list *ptr); + +public: + inline /* implicit */ map_list(); + inline /* implicit */ map_list(const map_list &obj); + inline explicit map_list(isl::ctx ctx, int n); + inline explicit map_list(isl::map el); + inline explicit map_list(isl::ctx ctx, const std::string &str); + inline map_list &operator=(map_list obj); + inline ~map_list(); + inline __isl_give isl_map_list *copy() const &; + 
inline __isl_give isl_map_list *copy() && = delete; + inline __isl_keep isl_map_list *get() const; + inline __isl_give isl_map_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::map_list add(isl::map el) const; + inline isl::map at(int index) const; + inline isl::map get_at(int index) const; + inline isl::map_list clear() const; + inline isl::map_list concat(isl::map_list list2) const; + inline isl::map_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::map_list insert(unsigned int pos, isl::map el) const; + inline class size size() const; +}; + +// declarations for isl::multi_aff +inline multi_aff manage(__isl_take isl_multi_aff *ptr); +inline multi_aff manage_copy(__isl_keep isl_multi_aff *ptr); + +class multi_aff { + friend inline multi_aff manage(__isl_take isl_multi_aff *ptr); + friend inline multi_aff manage_copy(__isl_keep isl_multi_aff *ptr); + +protected: + isl_multi_aff *ptr = nullptr; + + inline explicit multi_aff(__isl_take isl_multi_aff *ptr); + +public: + inline /* implicit */ multi_aff(); + inline /* implicit */ multi_aff(const multi_aff &obj); + inline /* implicit */ multi_aff(isl::aff aff); + inline explicit multi_aff(isl::space space, isl::aff_list list); + inline explicit multi_aff(isl::ctx ctx, const std::string &str); + inline multi_aff &operator=(multi_aff obj); + inline ~multi_aff(); + inline __isl_give isl_multi_aff *copy() const &; + inline __isl_give isl_multi_aff *copy() && = delete; + inline __isl_keep isl_multi_aff *get() const; + inline __isl_give isl_multi_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_aff add(isl::multi_aff multi2) const; + inline isl::multi_pw_aff add(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff add(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff add(const isl::pw_multi_aff &pma2) const; + inline 
isl::union_pw_multi_aff add(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff add(const isl::aff &multi2) const; + inline isl::multi_aff add_constant(isl::multi_val mv) const; + inline isl::multi_aff add_constant(isl::val v) const; + inline isl::multi_aff add_constant(long v) const; + inline isl::union_pw_multi_aff add_pw_multi_aff(const isl::pw_multi_aff &pma) const; + inline isl::union_pw_multi_aff apply(const isl::union_pw_multi_aff &upma2) const; + inline isl::map as_map() const; + inline isl::multi_aff as_multi_aff() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::union_map as_union_map() const; + inline isl::aff at(int pos) const; + inline isl::aff get_at(int pos) const; + inline isl::basic_set bind(isl::multi_id tuple) const; + inline isl::multi_aff bind_domain(isl::multi_id tuple) const; + inline isl::multi_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; + inline isl::pw_multi_aff coalesce() const; + inline isl::multi_val constant_multi_val() const; + inline isl::multi_val get_constant_multi_val() const; + inline class size dim(isl::dim type) const; + inline isl::set domain() const; + static inline isl::multi_aff domain_map(isl::space space); + inline isl::pw_multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::pw_multi_aff extract_pw_multi_aff(const isl::space &space) const; + inline isl::multi_aff flat_range_product(isl::multi_aff multi2) const; + inline isl::multi_pw_aff flat_range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff flat_range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff flat_range_product(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff flat_range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff flat_range_product(const isl::aff 
&multi2) const; + inline isl::multi_aff floor() const; + inline stat foreach_piece(const std::function &fn) const; + inline isl::multi_aff gist(isl::set context) const; + inline isl::union_pw_multi_aff gist(const isl::union_set &context) const; + inline isl::multi_aff gist(const isl::basic_set &context) const; + inline isl::multi_aff gist(const isl::point &context) const; + inline boolean has_range_tuple_id() const; + static inline isl::multi_aff identity(isl::space space); + inline isl::multi_aff identity() const; + static inline isl::multi_aff identity_on_domain(isl::space space); + inline isl::multi_aff insert_domain(isl::space domain) const; + inline isl::pw_multi_aff intersect_domain(const isl::set &set) const; + inline isl::union_pw_multi_aff intersect_domain(const isl::space &space) const; + inline isl::union_pw_multi_aff intersect_domain(const isl::union_set &uset) const; + inline isl::union_pw_multi_aff intersect_domain_wrapped_domain(const isl::union_set &uset) const; + inline isl::union_pw_multi_aff intersect_domain_wrapped_range(const isl::union_set &uset) const; + inline isl::pw_multi_aff intersect_params(const isl::set &set) const; + inline boolean involves_locals() const; + inline boolean involves_nan() const; + inline boolean involves_param(const isl::id &id) const; + inline boolean involves_param(const std::string &id) const; + inline boolean involves_param(const isl::id_list &list) const; + inline boolean isa_multi_aff() const; + inline boolean isa_pw_multi_aff() const; + inline isl::aff_list list() const; + inline isl::aff_list get_list() const; + inline isl::multi_pw_aff max(const isl::multi_pw_aff &multi2) const; + inline isl::multi_val max_multi_val() const; + inline isl::multi_pw_aff min(const isl::multi_pw_aff &multi2) const; + inline isl::multi_val min_multi_val() const; + static inline isl::multi_aff multi_val_on_domain(isl::space space, isl::multi_val mv); + inline class size n_piece() const; + inline isl::multi_aff neg() const; + inline 
boolean plain_is_empty() const; + inline boolean plain_is_equal(const isl::multi_aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::aff &multi2) const; + inline isl::pw_multi_aff preimage_domain_wrapped_domain(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff product(isl::multi_aff multi2) const; + inline isl::multi_pw_aff product(const isl::multi_pw_aff &multi2) const; + inline isl::pw_multi_aff product(const isl::pw_multi_aff &pma2) const; + inline isl::multi_aff product(const isl::aff &multi2) const; + inline isl::multi_aff pullback(isl::multi_aff ma2) const; + inline isl::multi_pw_aff pullback(const isl::multi_pw_aff &mpa2) const; + inline isl::pw_multi_aff pullback(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff pullback(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff pullback(const isl::aff &ma2) const; + inline isl::pw_multi_aff_list pw_multi_aff_list() const; + inline isl::pw_multi_aff range_factor_domain() const; + inline isl::pw_multi_aff range_factor_range() const; + static inline isl::multi_aff range_map(isl::space space); + inline isl::multi_aff range_product(isl::multi_aff multi2) const; + inline isl::multi_pw_aff range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff range_product(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff range_product(const isl::aff &multi2) const; + inline isl::id range_tuple_id() const; + inline isl::id get_range_tuple_id() const; + inline isl::multi_aff reset_range_tuple_id() const; 
+ inline isl::multi_aff reset_tuple_id(isl::dim type) const; + inline isl::multi_aff scale(isl::multi_val mv) const; + inline isl::multi_aff scale(isl::val v) const; + inline isl::multi_aff scale(long v) const; + inline isl::multi_aff scale_down(isl::multi_val mv) const; + inline isl::multi_aff scale_down(isl::val v) const; + inline isl::multi_aff scale_down(long v) const; + inline isl::multi_aff set_aff(int pos, isl::aff el) const; + inline isl::multi_aff set_at(int pos, isl::aff el) const; + inline isl::multi_pw_aff set_at(int pos, const isl::pw_aff &el) const; + inline isl::multi_union_pw_aff set_at(int pos, const isl::union_pw_aff &el) const; + inline isl::multi_pw_aff set_pw_aff(int pos, const isl::pw_aff &el) const; + inline isl::pw_multi_aff set_pw_aff(unsigned int pos, const isl::pw_aff &pa) const; + inline isl::multi_aff set_range_tuple(isl::id id) const; + inline isl::multi_aff set_range_tuple(const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, const isl::union_pw_aff &el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_aff sub(isl::multi_aff multi2) const; + inline isl::multi_pw_aff sub(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff sub(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff sub(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff sub(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_aff sub(const isl::aff &multi2) const; + inline isl::pw_multi_aff subtract_domain(const isl::set &set) const; + inline isl::union_pw_multi_aff subtract_domain(const isl::space &space) const; + inline isl::union_pw_multi_aff subtract_domain(const isl::union_set &uset) const; + inline isl::pw_multi_aff_list to_list() const; + inline isl::multi_pw_aff to_multi_pw_aff() const; + inline isl::multi_union_pw_aff to_multi_union_pw_aff() const; + inline isl::pw_multi_aff 
to_pw_multi_aff() const; + inline isl::union_pw_multi_aff to_union_pw_multi_aff() const; + inline isl::id tuple_id(isl::dim type) const; + inline isl::multi_aff unbind_params_insert_domain(isl::multi_id domain) const; + inline isl::multi_pw_aff union_add(const isl::multi_pw_aff &mpa2) const; + inline isl::multi_union_pw_aff union_add(const isl::multi_union_pw_aff &mupa2) const; + inline isl::pw_multi_aff union_add(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff union_add(const isl::union_pw_multi_aff &upma2) const; + static inline isl::multi_aff zero(isl::space space); +}; + +// declarations for isl::multi_id +inline multi_id manage(__isl_take isl_multi_id *ptr); +inline multi_id manage_copy(__isl_keep isl_multi_id *ptr); + +class multi_id { + friend inline multi_id manage(__isl_take isl_multi_id *ptr); + friend inline multi_id manage_copy(__isl_keep isl_multi_id *ptr); + +protected: + isl_multi_id *ptr = nullptr; + + inline explicit multi_id(__isl_take isl_multi_id *ptr); + +public: + inline /* implicit */ multi_id(); + inline /* implicit */ multi_id(const multi_id &obj); + inline explicit multi_id(isl::space space, isl::id_list list); + inline explicit multi_id(isl::ctx ctx, const std::string &str); + inline multi_id &operator=(multi_id obj); + inline ~multi_id(); + inline __isl_give isl_multi_id *copy() const &; + inline __isl_give isl_multi_id *copy() && = delete; + inline __isl_keep isl_multi_id *get() const; + inline __isl_give isl_multi_id *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::id at(int pos) const; + inline isl::id get_at(int pos) const; + inline isl::multi_id flat_range_product(isl::multi_id multi2) const; + inline isl::id_list list() const; + inline isl::id_list get_list() const; + inline boolean plain_is_equal(const isl::multi_id &multi2) const; + inline isl::multi_id range_product(isl::multi_id multi2) const; + inline isl::multi_id set_at(int pos, isl::id el) const; + inline 
isl::multi_id set_at(int pos, const std::string &el) const; + inline isl::multi_id set_id(int pos, isl::id el) const; + inline isl::multi_id set_id(int pos, const std::string &el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; +}; + +// declarations for isl::multi_pw_aff +inline multi_pw_aff manage(__isl_take isl_multi_pw_aff *ptr); +inline multi_pw_aff manage_copy(__isl_keep isl_multi_pw_aff *ptr); + +class multi_pw_aff { + friend inline multi_pw_aff manage(__isl_take isl_multi_pw_aff *ptr); + friend inline multi_pw_aff manage_copy(__isl_keep isl_multi_pw_aff *ptr); + +protected: + isl_multi_pw_aff *ptr = nullptr; + + inline explicit multi_pw_aff(__isl_take isl_multi_pw_aff *ptr); + +public: + inline /* implicit */ multi_pw_aff(); + inline /* implicit */ multi_pw_aff(const multi_pw_aff &obj); + inline /* implicit */ multi_pw_aff(isl::aff aff); + inline /* implicit */ multi_pw_aff(isl::multi_aff ma); + inline /* implicit */ multi_pw_aff(isl::pw_aff pa); + inline explicit multi_pw_aff(isl::space space, isl::pw_aff_list list); + inline /* implicit */ multi_pw_aff(isl::pw_multi_aff pma); + inline explicit multi_pw_aff(isl::ctx ctx, const std::string &str); + inline multi_pw_aff &operator=(multi_pw_aff obj); + inline ~multi_pw_aff(); + inline __isl_give isl_multi_pw_aff *copy() const &; + inline __isl_give isl_multi_pw_aff *copy() && = delete; + inline __isl_keep isl_multi_pw_aff *get() const; + inline __isl_give isl_multi_pw_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_pw_aff add(isl::multi_pw_aff multi2) const; + inline isl::multi_union_pw_aff add(const isl::multi_union_pw_aff &multi2) const; + inline isl::multi_pw_aff add(const isl::aff &multi2) const; + inline isl::multi_pw_aff add(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff add(const isl::pw_aff &multi2) const; + inline isl::multi_pw_aff add(const isl::pw_multi_aff &multi2) 
const; + inline isl::multi_pw_aff add_constant(isl::multi_val mv) const; + inline isl::multi_pw_aff add_constant(isl::val v) const; + inline isl::multi_pw_aff add_constant(long v) const; + inline isl::map as_map() const; + inline isl::multi_aff as_multi_aff() const; + inline isl::set as_set() const; + inline isl::pw_aff at(int pos) const; + inline isl::pw_aff get_at(int pos) const; + inline isl::set bind(isl::multi_id tuple) const; + inline isl::multi_pw_aff bind_domain(isl::multi_id tuple) const; + inline isl::multi_pw_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; + inline isl::multi_pw_aff coalesce() const; + inline class size dim(isl::dim type) const; + inline isl::set domain() const; + inline isl::multi_pw_aff flat_range_product(isl::multi_pw_aff multi2) const; + inline isl::multi_union_pw_aff flat_range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::multi_pw_aff flat_range_product(const isl::aff &multi2) const; + inline isl::multi_pw_aff flat_range_product(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff flat_range_product(const isl::pw_aff &multi2) const; + inline isl::multi_pw_aff flat_range_product(const isl::pw_multi_aff &multi2) const; + inline isl::multi_pw_aff gist(isl::set set) const; + inline isl::multi_union_pw_aff gist(const isl::union_set &context) const; + inline isl::multi_pw_aff gist(const isl::basic_set &set) const; + inline isl::multi_pw_aff gist(const isl::point &set) const; + inline boolean has_range_tuple_id() const; + static inline isl::multi_pw_aff identity(isl::space space); + inline isl::multi_pw_aff identity() const; + static inline isl::multi_pw_aff identity_on_domain(isl::space space); + inline isl::multi_pw_aff insert_domain(isl::space domain) const; + inline isl::multi_pw_aff intersect_domain(isl::set domain) const; + inline isl::multi_union_pw_aff intersect_domain(const isl::union_set &uset) const; + inline isl::multi_pw_aff intersect_domain(const isl::basic_set &domain) const; + 
inline isl::multi_pw_aff intersect_domain(const isl::point &domain) const; + inline isl::multi_pw_aff intersect_params(isl::set set) const; + inline boolean involves_nan() const; + inline boolean involves_param(const isl::id &id) const; + inline boolean involves_param(const std::string &id) const; + inline boolean involves_param(const isl::id_list &list) const; + inline boolean isa_multi_aff() const; + inline isl::pw_aff_list list() const; + inline isl::pw_aff_list get_list() const; + inline isl::multi_pw_aff max(isl::multi_pw_aff multi2) const; + inline isl::multi_val max_multi_val() const; + inline isl::multi_pw_aff min(isl::multi_pw_aff multi2) const; + inline isl::multi_val min_multi_val() const; + inline isl::multi_pw_aff neg() const; + inline boolean plain_is_equal(const isl::multi_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_aff &multi2) const; + inline boolean plain_is_equal(const isl::pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::pw_multi_aff &multi2) const; + inline isl::multi_pw_aff product(isl::multi_pw_aff multi2) const; + inline isl::multi_pw_aff pullback(isl::multi_aff ma) const; + inline isl::multi_pw_aff pullback(isl::multi_pw_aff mpa2) const; + inline isl::multi_pw_aff pullback(isl::pw_multi_aff pma) const; + inline isl::multi_union_pw_aff pullback(const isl::union_pw_multi_aff &upma) const; + inline isl::multi_pw_aff range_product(isl::multi_pw_aff multi2) const; + inline isl::multi_union_pw_aff range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::multi_pw_aff range_product(const isl::aff &multi2) const; + inline isl::multi_pw_aff range_product(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff range_product(const isl::pw_aff &multi2) const; + inline isl::multi_pw_aff range_product(const isl::pw_multi_aff &multi2) const; + 
inline isl::id range_tuple_id() const; + inline isl::id get_range_tuple_id() const; + inline isl::multi_pw_aff reset_range_tuple_id() const; + inline isl::multi_pw_aff reset_tuple_id(isl::dim type) const; + inline isl::multi_pw_aff scale(isl::multi_val mv) const; + inline isl::multi_pw_aff scale(isl::val v) const; + inline isl::multi_pw_aff scale(long v) const; + inline isl::multi_pw_aff scale_down(isl::multi_val mv) const; + inline isl::multi_pw_aff scale_down(isl::val v) const; + inline isl::multi_pw_aff scale_down(long v) const; + inline isl::multi_pw_aff set_at(int pos, isl::pw_aff el) const; + inline isl::multi_union_pw_aff set_at(int pos, const isl::union_pw_aff &el) const; + inline isl::multi_pw_aff set_pw_aff(int pos, isl::pw_aff el) const; + inline isl::multi_pw_aff set_range_tuple(isl::id id) const; + inline isl::multi_pw_aff set_range_tuple(const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, const isl::union_pw_aff &el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_pw_aff sub(isl::multi_pw_aff multi2) const; + inline isl::multi_union_pw_aff sub(const isl::multi_union_pw_aff &multi2) const; + inline isl::multi_pw_aff sub(const isl::aff &multi2) const; + inline isl::multi_pw_aff sub(const isl::multi_aff &multi2) const; + inline isl::multi_pw_aff sub(const isl::pw_aff &multi2) const; + inline isl::multi_pw_aff sub(const isl::pw_multi_aff &multi2) const; + inline isl::multi_pw_aff unbind_params_insert_domain(isl::multi_id domain) const; + inline isl::multi_pw_aff union_add(isl::multi_pw_aff mpa2) const; + inline isl::multi_union_pw_aff union_add(const isl::multi_union_pw_aff &mupa2) const; + inline isl::multi_pw_aff union_add(const isl::aff &mpa2) const; + inline isl::multi_pw_aff union_add(const isl::multi_aff &mpa2) const; + inline isl::multi_pw_aff union_add(const isl::pw_aff &mpa2) const; + inline isl::multi_pw_aff union_add(const 
isl::pw_multi_aff &mpa2) const; + static inline isl::multi_pw_aff zero(isl::space space); +}; + +// declarations for isl::multi_union_pw_aff +inline multi_union_pw_aff manage(__isl_take isl_multi_union_pw_aff *ptr); +inline multi_union_pw_aff manage_copy(__isl_keep isl_multi_union_pw_aff *ptr); + +class multi_union_pw_aff { + friend inline multi_union_pw_aff manage(__isl_take isl_multi_union_pw_aff *ptr); + friend inline multi_union_pw_aff manage_copy(__isl_keep isl_multi_union_pw_aff *ptr); + +protected: + isl_multi_union_pw_aff *ptr = nullptr; + + inline explicit multi_union_pw_aff(__isl_take isl_multi_union_pw_aff *ptr); + +public: + inline /* implicit */ multi_union_pw_aff(); + inline /* implicit */ multi_union_pw_aff(const multi_union_pw_aff &obj); + inline /* implicit */ multi_union_pw_aff(isl::multi_pw_aff mpa); + inline /* implicit */ multi_union_pw_aff(isl::union_pw_aff upa); + inline explicit multi_union_pw_aff(isl::space space, isl::union_pw_aff_list list); + inline explicit multi_union_pw_aff(isl::union_pw_multi_aff upma); + inline explicit multi_union_pw_aff(isl::ctx ctx, const std::string &str); + inline multi_union_pw_aff &operator=(multi_union_pw_aff obj); + inline ~multi_union_pw_aff(); + inline __isl_give isl_multi_union_pw_aff *copy() const &; + inline __isl_give isl_multi_union_pw_aff *copy() && = delete; + inline __isl_keep isl_multi_union_pw_aff *get() const; + inline __isl_give isl_multi_union_pw_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_union_pw_aff add(isl::multi_union_pw_aff multi2) const; + inline isl::union_pw_aff at(int pos) const; + inline isl::union_pw_aff get_at(int pos) const; + inline isl::union_set bind(isl::multi_id tuple) const; + inline isl::multi_union_pw_aff coalesce() const; + inline class size dim(isl::dim type) const; + inline isl::union_set domain() const; + inline isl::multi_union_pw_aff flat_range_product(isl::multi_union_pw_aff multi2) const; + static inline 
isl::multi_union_pw_aff from_union_map(isl::union_map umap); + inline isl::multi_union_pw_aff gist(isl::union_set context) const; + inline boolean has_range_tuple_id() const; + inline isl::multi_union_pw_aff intersect_domain(isl::union_set uset) const; + inline isl::multi_union_pw_aff intersect_params(isl::set params) const; + inline boolean involves_nan() const; + inline isl::union_pw_aff_list list() const; + inline isl::union_pw_aff_list get_list() const; + inline isl::multi_union_pw_aff neg() const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline isl::multi_union_pw_aff pullback(isl::union_pw_multi_aff upma) const; + inline isl::multi_union_pw_aff range_product(isl::multi_union_pw_aff multi2) const; + inline isl::id range_tuple_id() const; + inline isl::id get_range_tuple_id() const; + inline isl::multi_union_pw_aff reset_range_tuple_id() const; + inline isl::multi_union_pw_aff reset_tuple_id(isl::dim type) const; + inline isl::multi_union_pw_aff scale(isl::multi_val mv) const; + inline isl::multi_union_pw_aff scale(isl::val v) const; + inline isl::multi_union_pw_aff scale(long v) const; + inline isl::multi_union_pw_aff scale_down(isl::multi_val mv) const; + inline isl::multi_union_pw_aff scale_down(isl::val v) const; + inline isl::multi_union_pw_aff scale_down(long v) const; + inline isl::multi_union_pw_aff set_at(int pos, isl::union_pw_aff el) const; + inline isl::multi_union_pw_aff set_range_tuple(isl::id id) const; + inline isl::multi_union_pw_aff set_range_tuple(const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, isl::union_pw_aff el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_union_pw_aff sub(isl::multi_union_pw_aff multi2) const; + inline isl::multi_union_pw_aff union_add(isl::multi_union_pw_aff mupa2) const; + static inline isl::multi_union_pw_aff zero(isl::space space); +}; + +// 
declarations for isl::multi_val +inline multi_val manage(__isl_take isl_multi_val *ptr); +inline multi_val manage_copy(__isl_keep isl_multi_val *ptr); + +class multi_val { + friend inline multi_val manage(__isl_take isl_multi_val *ptr); + friend inline multi_val manage_copy(__isl_keep isl_multi_val *ptr); + +protected: + isl_multi_val *ptr = nullptr; + + inline explicit multi_val(__isl_take isl_multi_val *ptr); + +public: + inline /* implicit */ multi_val(); + inline /* implicit */ multi_val(const multi_val &obj); + inline explicit multi_val(isl::space space, isl::val_list list); + inline explicit multi_val(isl::ctx ctx, const std::string &str); + inline multi_val &operator=(multi_val obj); + inline ~multi_val(); + inline __isl_give isl_multi_val *copy() const &; + inline __isl_give isl_multi_val *copy() && = delete; + inline __isl_keep isl_multi_val *get() const; + inline __isl_give isl_multi_val *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_val add(isl::multi_val multi2) const; + inline isl::multi_val add(isl::val v) const; + inline isl::multi_val add(long v) const; + inline isl::val at(int pos) const; + inline isl::val get_at(int pos) const; + inline class size dim(isl::dim type) const; + inline isl::multi_val flat_range_product(isl::multi_val multi2) const; + inline boolean has_range_tuple_id() const; + inline boolean involves_nan() const; + inline isl::val_list list() const; + inline isl::val_list get_list() const; + inline isl::multi_val max(isl::multi_val multi2) const; + inline isl::multi_val min(isl::multi_val multi2) const; + inline isl::multi_val neg() const; + inline boolean plain_is_equal(const isl::multi_val &multi2) const; + inline isl::multi_val product(isl::multi_val multi2) const; + inline isl::multi_val range_product(isl::multi_val multi2) const; + inline isl::id range_tuple_id() const; + inline isl::id get_range_tuple_id() const; + inline isl::multi_val reset_range_tuple_id() const; + inline 
isl::multi_val reset_tuple_id(isl::dim type) const; + inline isl::multi_val scale(isl::multi_val mv) const; + inline isl::multi_val scale(isl::val v) const; + inline isl::multi_val scale(long v) const; + inline isl::multi_val scale_down(isl::multi_val mv) const; + inline isl::multi_val scale_down(isl::val v) const; + inline isl::multi_val scale_down(long v) const; + inline isl::multi_val set_at(int pos, isl::val el) const; + inline isl::multi_val set_at(int pos, long el) const; + inline isl::multi_val set_range_tuple(isl::id id) const; + inline isl::multi_val set_range_tuple(const std::string &id) const; + inline isl::multi_val set_val(int pos, isl::val el) const; + inline isl::multi_val set_val(int pos, long el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_val sub(isl::multi_val multi2) const; + static inline isl::multi_val zero(isl::space space); +}; + +// declarations for isl::point +inline point manage(__isl_take isl_point *ptr); +inline point manage_copy(__isl_keep isl_point *ptr); + +class point { + friend inline point manage(__isl_take isl_point *ptr); + friend inline point manage_copy(__isl_keep isl_point *ptr); + +protected: + isl_point *ptr = nullptr; + + inline explicit point(__isl_take isl_point *ptr); + +public: + inline /* implicit */ point(); + inline /* implicit */ point(const point &obj); + inline explicit point(isl::space space); + inline point &operator=(point obj); + inline ~point(); + inline __isl_give isl_point *copy() const &; + inline __isl_give isl_point *copy() && = delete; + inline __isl_keep isl_point *get() const; + inline __isl_give isl_point *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::set add_constraint(const isl::constraint &constraint) const; + inline isl::set add_dims(isl::dim type, unsigned int n) const; + inline isl::basic_set affine_hull() const; + inline isl::set align_params(const isl::space 
&model) const; + inline isl::basic_set apply(const isl::basic_map &bmap) const; + inline isl::set apply(const isl::map &map) const; + inline isl::union_set apply(const isl::union_map &umap) const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::basic_set_list basic_set_list() const; + inline isl::set bind(const isl::multi_id &tuple) const; + inline isl::set coalesce() const; + inline isl::set complement() const; + inline isl::union_set compute_divs() const; + inline boolean contains(const isl::space &space) const; + inline isl::basic_set convex_hull() const; + inline isl::val coordinate_val(isl::dim type, int pos) const; + inline isl::val get_coordinate_val(isl::dim type, int pos) const; + inline isl::basic_set detect_equalities() const; + inline class size dim(isl::dim type) const; + inline boolean dim_has_any_lower_bound(isl::dim type, unsigned int pos) const; + inline isl::id dim_id(isl::dim type, unsigned int pos) const; + inline isl::pw_aff dim_max(int pos) const; + inline isl::val dim_max_val(int pos) const; + inline isl::pw_aff dim_min(int pos) const; + inline isl::val dim_min_val(int pos) const; + inline std::string dim_name(isl::dim type, unsigned int pos) const; + inline isl::aff div(int pos) const; + inline isl::set drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set eliminate(isl::dim type, unsigned int first, unsigned int n) const; + inline boolean every_set(const std::function &test) const; + inline isl::set extract_set(const isl::space &space) const; + inline int find_dim_by_id(isl::dim type, const isl::id &id) const; + inline int find_dim_by_id(isl::dim type, const std::string &id) const; + inline isl::basic_set fix_si(isl::dim type, unsigned int pos, int value) const; + inline isl::basic_set fix_val(isl::dim type, unsigned int pos, const isl::val &v) const; + inline isl::basic_set fix_val(isl::dim type, unsigned int pos, long v) const; + 
inline isl::basic_set flatten() const; + inline stat foreach_basic_set(const std::function &fn) const; + inline stat foreach_point(const std::function &fn) const; + inline stat foreach_set(const std::function &fn) const; + inline isl::basic_set gist(const isl::basic_set &context) const; + inline isl::set gist(const isl::set &context) const; + inline isl::union_set gist(const isl::union_set &context) const; + inline isl::set gist_params(const isl::set &context) const; + inline boolean has_equal_space(const isl::set &set2) const; + inline isl::map identity() const; + inline isl::union_pw_multi_aff identity_union_pw_multi_aff() const; + inline isl::pw_aff indicator_function() const; + inline isl::set insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; + inline isl::map insert_domain(const isl::space &domain) const; + inline isl::basic_set intersect(const isl::basic_set &bset2) const; + inline isl::set intersect(const isl::set &set2) const; + inline isl::union_set intersect(const isl::union_set &uset2) const; + inline isl::basic_set intersect_params(const isl::basic_set &bset2) const; + inline isl::set intersect_params(const isl::set ¶ms) const; + inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline boolean involves_locals() const; + inline boolean is_bounded() const; + inline boolean is_disjoint(const isl::set &set2) const; + inline boolean is_disjoint(const isl::union_set &uset2) const; + inline boolean is_empty() const; + inline boolean is_equal(const isl::basic_set &bset2) const; + inline boolean is_equal(const isl::set &set2) const; + inline boolean is_equal(const isl::union_set &uset2) const; + inline boolean is_params() const; + inline boolean is_singleton() const; + inline boolean is_strict_subset(const isl::set &set2) const; + inline boolean is_strict_subset(const isl::union_set &uset2) const; + inline boolean is_subset(const isl::basic_set &bset2) const; + inline boolean is_subset(const isl::set 
&set2) const; + inline boolean is_subset(const isl::union_set &uset2) const; + inline boolean is_wrapping() const; + inline boolean isa_set() const; + inline isl::set lexmax() const; + inline isl::pw_multi_aff lexmax_pw_multi_aff() const; + inline isl::set lexmin() const; + inline isl::pw_multi_aff lexmin_pw_multi_aff() const; + inline isl::set lower_bound(const isl::multi_pw_aff &lower) const; + inline isl::set lower_bound(const isl::multi_val &lower) const; + inline isl::set lower_bound_si(isl::dim type, unsigned int pos, int value) const; + inline isl::set lower_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const; + inline isl::set lower_bound_val(isl::dim type, unsigned int pos, long value) const; + inline isl::multi_pw_aff max_multi_pw_aff() const; + inline isl::val max_val(const isl::aff &obj) const; + inline isl::multi_pw_aff min_multi_pw_aff() const; + inline isl::val min_val(const isl::aff &obj) const; + inline isl::multi_val multi_val() const; + inline isl::multi_val get_multi_val() const; + inline class size n_basic_set() const; + inline isl::basic_set params() const; + inline isl::val plain_get_val_if_fixed(isl::dim type, unsigned int pos) const; + inline isl::multi_val plain_multi_val_if_fixed() const; + inline isl::basic_set polyhedral_hull() const; + inline isl::set preimage(const isl::multi_aff &ma) const; + inline isl::set preimage(const isl::multi_pw_aff &mpa) const; + inline isl::set preimage(const isl::pw_multi_aff &pma) const; + inline isl::union_set preimage(const isl::union_pw_multi_aff &upma) const; + inline isl::set product(const isl::set &set2) const; + inline isl::basic_set project_out(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set project_out_all_params() const; + inline isl::set project_out_param(const isl::id &id) const; + inline isl::set project_out_param(const std::string &id) const; + inline isl::set project_out_param(const isl::id_list &list) const; + inline isl::pw_multi_aff 
pw_multi_aff_on_domain(const isl::multi_val &mv) const; + inline isl::set remove_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set remove_divs() const; + inline isl::set remove_redundancies() const; + inline isl::set reset_tuple_id() const; + inline isl::basic_set sample() const; + inline isl::point sample_point() const; + inline isl::set set_dim_id(isl::dim type, unsigned int pos, const isl::id &id) const; + inline isl::set set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const; + inline isl::set_list set_list() const; + inline isl::set set_tuple_id(const isl::id &id) const; + inline isl::set set_tuple_id(const std::string &id) const; + inline isl::fixed_box simple_fixed_box_hull() const; + inline isl::basic_set simple_hull() const; + inline isl::space space() const; + inline isl::val stride(int pos) const; + inline isl::set subtract(const isl::set &set2) const; + inline isl::union_set subtract(const isl::union_set &uset2) const; + inline isl::basic_set_list to_list() const; + inline isl::set to_set() const; + inline isl::union_set to_union_set() const; + inline isl::map translation() const; + inline class size tuple_dim() const; + inline isl::id tuple_id() const; + inline std::string tuple_name() const; + inline isl::set unbind_params(const isl::multi_id &tuple) const; + inline isl::map unbind_params_insert_domain(const isl::multi_id &domain) const; + inline isl::set unite(const isl::basic_set &bset2) const; + inline isl::set unite(const isl::set &set2) const; + inline isl::union_set unite(const isl::union_set &uset2) const; + inline isl::basic_set unshifted_simple_hull() const; + inline isl::map unwrap() const; + inline isl::set upper_bound(const isl::multi_pw_aff &upper) const; + inline isl::set upper_bound(const isl::multi_val &upper) const; + inline isl::set upper_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const; + inline isl::set upper_bound_val(isl::dim type, unsigned int pos, long 
value) const; +}; + +// declarations for isl::pw_aff +inline pw_aff manage(__isl_take isl_pw_aff *ptr); +inline pw_aff manage_copy(__isl_keep isl_pw_aff *ptr); + +class pw_aff { + friend inline pw_aff manage(__isl_take isl_pw_aff *ptr); + friend inline pw_aff manage_copy(__isl_keep isl_pw_aff *ptr); + +protected: + isl_pw_aff *ptr = nullptr; + + inline explicit pw_aff(__isl_take isl_pw_aff *ptr); + +public: + inline /* implicit */ pw_aff(); + inline /* implicit */ pw_aff(const pw_aff &obj); + inline /* implicit */ pw_aff(isl::aff aff); + inline explicit pw_aff(isl::ctx ctx, const std::string &str); + inline explicit pw_aff(isl::set domain, isl::val v); + inline explicit pw_aff(isl::local_space ls); + inline pw_aff &operator=(pw_aff obj); + inline ~pw_aff(); + inline __isl_give isl_pw_aff *copy() const &; + inline __isl_give isl_pw_aff *copy() && = delete; + inline __isl_keep isl_pw_aff *get() const; + inline __isl_give isl_pw_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_pw_aff add(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff add(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_aff add(isl::pw_aff pwaff2) const; + inline isl::pw_multi_aff add(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_aff add(const isl::union_pw_aff &upa2) const; + inline isl::union_pw_multi_aff add(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_aff add(const isl::aff &pwaff2) const; + inline isl::pw_aff add_constant(isl::val v) const; + inline isl::pw_aff add_constant(long v) const; + inline isl::pw_multi_aff add_constant(const isl::multi_val &mv) const; + inline isl::pw_aff add_dims(isl::dim type, unsigned int n) const; + inline isl::union_pw_multi_aff add_pw_multi_aff(const isl::pw_multi_aff &pma) const; + inline isl::union_pw_multi_aff apply(const isl::union_pw_multi_aff &upma2) const; + inline isl::aff as_aff() const; + inline isl::map as_map() const; + inline 
isl::multi_aff as_multi_aff() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::union_map as_union_map() const; + inline isl::pw_aff at(int pos) const; + inline isl::set bind(const isl::multi_id &tuple) const; + inline isl::set bind(isl::id id) const; + inline isl::set bind(const std::string &id) const; + inline isl::pw_aff bind_domain(isl::multi_id tuple) const; + inline isl::pw_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; + inline isl::pw_aff ceil() const; + inline isl::pw_aff coalesce() const; + inline isl::pw_aff cond(isl::pw_aff pwaff_true, isl::pw_aff pwaff_false) const; + inline class size dim(isl::dim type) const; + inline isl::id dim_id(isl::dim type, unsigned int pos) const; + inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; + inline isl::pw_aff div(isl::pw_aff pa2) const; + inline isl::set domain() const; + inline isl::space domain_space() const; + inline isl::space get_domain_space() const; + inline isl::pw_multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set eq_set(isl::pw_aff pwaff2) const; + inline isl::val eval(isl::point pnt) const; + inline isl::pw_multi_aff extract_pw_multi_aff(const isl::space &space) const; + inline isl::multi_pw_aff flat_range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff flat_range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff flat_range_product(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff flat_range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_aff floor() const; + inline stat foreach_piece(const std::function &fn) const; + inline stat foreach_piece(const std::function &fn) const; + inline stat foreach_pw_aff(const std::function &fn) const; + inline isl::set ge_set(isl::pw_aff pwaff2) const; + inline isl::pw_aff 
gist(isl::set context) const; + inline isl::union_pw_aff gist(const isl::union_set &context) const; + inline isl::pw_aff gist(const isl::basic_set &context) const; + inline isl::pw_aff gist(const isl::point &context) const; + inline isl::set gt_set(isl::pw_aff pwaff2) const; + inline boolean has_range_tuple_id() const; + inline isl::multi_pw_aff identity() const; + inline isl::pw_aff insert_domain(isl::space domain) const; + inline isl::pw_aff intersect_domain(isl::set set) const; + inline isl::union_pw_aff intersect_domain(const isl::space &space) const; + inline isl::union_pw_aff intersect_domain(const isl::union_set &uset) const; + inline isl::pw_aff intersect_domain(const isl::basic_set &set) const; + inline isl::pw_aff intersect_domain(const isl::point &set) const; + inline isl::union_pw_aff intersect_domain_wrapped_domain(const isl::union_set &uset) const; + inline isl::union_pw_aff intersect_domain_wrapped_range(const isl::union_set &uset) const; + inline isl::pw_aff intersect_params(isl::set set) const; + inline boolean involves_locals() const; + inline boolean involves_nan() const; + inline boolean involves_param(const isl::id &id) const; + inline boolean involves_param(const std::string &id) const; + inline boolean involves_param(const isl::id_list &list) const; + inline boolean is_cst() const; + inline boolean is_equal(const isl::pw_aff &pa2) const; + inline boolean isa_aff() const; + inline boolean isa_multi_aff() const; + inline boolean isa_pw_multi_aff() const; + inline isl::set le_set(isl::pw_aff pwaff2) const; + inline isl::pw_aff_list list() const; + inline isl::set lt_set(isl::pw_aff pwaff2) const; + inline isl::multi_pw_aff max(const isl::multi_pw_aff &multi2) const; + inline isl::pw_aff max(isl::pw_aff pwaff2) const; + inline isl::pw_aff max(const isl::aff &pwaff2) const; + inline isl::multi_val max_multi_val() const; + inline isl::multi_pw_aff min(const isl::multi_pw_aff &multi2) const; + inline isl::pw_aff min(isl::pw_aff pwaff2) const; + 
inline isl::pw_aff min(const isl::aff &pwaff2) const; + inline isl::multi_val min_multi_val() const; + inline isl::pw_aff mod(isl::val mod) const; + inline isl::pw_aff mod(long mod) const; + inline isl::pw_aff mul(isl::pw_aff pwaff2) const; + inline class size n_piece() const; + inline isl::set ne_set(isl::pw_aff pwaff2) const; + inline isl::pw_aff neg() const; + static inline isl::pw_aff param_on_domain(isl::set domain, isl::id id); + inline boolean plain_is_empty() const; + inline boolean plain_is_equal(const isl::multi_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff preimage_domain_wrapped_domain(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_pw_aff product(const isl::multi_pw_aff &multi2) const; + inline isl::pw_multi_aff product(const isl::pw_multi_aff &pma2) const; + inline isl::pw_aff pullback(isl::multi_aff ma) const; + inline isl::pw_aff pullback(isl::multi_pw_aff mpa) const; + inline isl::pw_aff pullback(isl::pw_multi_aff pma) const; + inline isl::union_pw_aff pullback(const isl::union_pw_multi_aff &upma) const; + inline isl::pw_multi_aff_list pw_multi_aff_list() const; + inline isl::pw_multi_aff range_factor_domain() const; + inline isl::pw_multi_aff range_factor_range() const; + inline isl::multi_pw_aff range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff range_product(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_multi_aff range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::id range_tuple_id() const; + inline isl::multi_pw_aff reset_range_tuple_id() const; + inline isl::multi_pw_aff reset_tuple_id(isl::dim type) const; + inline isl::multi_pw_aff scale(const isl::multi_val &mv) const; + inline 
isl::pw_aff scale(isl::val v) const; + inline isl::pw_aff scale(long v) const; + inline isl::multi_pw_aff scale_down(const isl::multi_val &mv) const; + inline isl::pw_aff scale_down(isl::val f) const; + inline isl::pw_aff scale_down(long f) const; + inline isl::multi_pw_aff set_at(int pos, const isl::pw_aff &el) const; + inline isl::multi_union_pw_aff set_at(int pos, const isl::union_pw_aff &el) const; + inline isl::multi_pw_aff set_pw_aff(int pos, const isl::pw_aff &el) const; + inline isl::pw_multi_aff set_pw_aff(unsigned int pos, const isl::pw_aff &pa) const; + inline isl::pw_multi_aff set_range_tuple(const isl::id &id) const; + inline isl::pw_multi_aff set_range_tuple(const std::string &id) const; + inline isl::pw_aff set_tuple_id(isl::dim type, isl::id id) const; + inline isl::pw_aff set_tuple_id(isl::dim type, const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, const isl::union_pw_aff &el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_pw_aff sub(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff sub(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_aff sub(isl::pw_aff pwaff2) const; + inline isl::pw_multi_aff sub(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_aff sub(const isl::union_pw_aff &upa2) const; + inline isl::union_pw_multi_aff sub(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_aff sub(const isl::aff &pwaff2) const; + inline isl::pw_aff subtract_domain(isl::set set) const; + inline isl::union_pw_aff subtract_domain(const isl::space &space) const; + inline isl::union_pw_aff subtract_domain(const isl::union_set &uset) const; + inline isl::pw_aff subtract_domain(const isl::basic_set &set) const; + inline isl::pw_aff subtract_domain(const isl::point &set) const; + inline isl::pw_aff tdiv_q(isl::pw_aff pa2) const; + inline isl::pw_aff tdiv_r(isl::pw_aff pa2) const; + 
inline isl::pw_aff_list to_list() const; + inline isl::multi_pw_aff to_multi_pw_aff() const; + inline isl::union_pw_aff to_union_pw_aff() const; + inline isl::union_pw_multi_aff to_union_pw_multi_aff() const; + inline isl::id tuple_id(isl::dim type) const; + inline isl::id get_tuple_id(isl::dim type) const; + inline isl::multi_pw_aff unbind_params_insert_domain(const isl::multi_id &domain) const; + inline isl::multi_pw_aff union_add(const isl::multi_pw_aff &mpa2) const; + inline isl::multi_union_pw_aff union_add(const isl::multi_union_pw_aff &mupa2) const; + inline isl::pw_aff union_add(isl::pw_aff pwaff2) const; + inline isl::pw_multi_aff union_add(const isl::pw_multi_aff &pma2) const; + inline isl::union_pw_aff union_add(const isl::union_pw_aff &upa2) const; + inline isl::union_pw_multi_aff union_add(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_aff union_add(const isl::aff &pwaff2) const; + static inline isl::pw_aff var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos); +}; + +// declarations for isl::pw_aff_list +inline pw_aff_list manage(__isl_take isl_pw_aff_list *ptr); +inline pw_aff_list manage_copy(__isl_keep isl_pw_aff_list *ptr); + +class pw_aff_list { + friend inline pw_aff_list manage(__isl_take isl_pw_aff_list *ptr); + friend inline pw_aff_list manage_copy(__isl_keep isl_pw_aff_list *ptr); + +protected: + isl_pw_aff_list *ptr = nullptr; + + inline explicit pw_aff_list(__isl_take isl_pw_aff_list *ptr); + +public: + inline /* implicit */ pw_aff_list(); + inline /* implicit */ pw_aff_list(const pw_aff_list &obj); + inline explicit pw_aff_list(isl::ctx ctx, int n); + inline explicit pw_aff_list(isl::pw_aff el); + inline explicit pw_aff_list(isl::ctx ctx, const std::string &str); + inline pw_aff_list &operator=(pw_aff_list obj); + inline ~pw_aff_list(); + inline __isl_give isl_pw_aff_list *copy() const &; + inline __isl_give isl_pw_aff_list *copy() && = delete; + inline __isl_keep isl_pw_aff_list *get() const; + inline 
__isl_give isl_pw_aff_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::pw_aff_list add(isl::pw_aff el) const; + inline isl::pw_aff at(int index) const; + inline isl::pw_aff get_at(int index) const; + inline isl::pw_aff_list clear() const; + inline isl::pw_aff_list concat(isl::pw_aff_list list2) const; + inline isl::pw_aff_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::pw_aff_list insert(unsigned int pos, isl::pw_aff el) const; + inline class size size() const; +}; + +// declarations for isl::pw_multi_aff +inline pw_multi_aff manage(__isl_take isl_pw_multi_aff *ptr); +inline pw_multi_aff manage_copy(__isl_keep isl_pw_multi_aff *ptr); + +class pw_multi_aff { + friend inline pw_multi_aff manage(__isl_take isl_pw_multi_aff *ptr); + friend inline pw_multi_aff manage_copy(__isl_keep isl_pw_multi_aff *ptr); + +protected: + isl_pw_multi_aff *ptr = nullptr; + + inline explicit pw_multi_aff(__isl_take isl_pw_multi_aff *ptr); + +public: + inline /* implicit */ pw_multi_aff(); + inline /* implicit */ pw_multi_aff(const pw_multi_aff &obj); + inline /* implicit */ pw_multi_aff(isl::multi_aff ma); + inline /* implicit */ pw_multi_aff(isl::pw_aff pa); + inline explicit pw_multi_aff(isl::ctx ctx, const std::string &str); + inline pw_multi_aff &operator=(pw_multi_aff obj); + inline ~pw_multi_aff(); + inline __isl_give isl_pw_multi_aff *copy() const &; + inline __isl_give isl_pw_multi_aff *copy() && = delete; + inline __isl_keep isl_pw_multi_aff *get() const; + inline __isl_give isl_pw_multi_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_pw_aff add(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff add(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff add(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff add(const isl::union_pw_multi_aff &upma2) const; + 
inline isl::pw_multi_aff add(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff add(const isl::pw_aff &pma2) const; + inline isl::pw_multi_aff add_constant(isl::multi_val mv) const; + inline isl::pw_multi_aff add_constant(isl::val v) const; + inline isl::pw_multi_aff add_constant(long v) const; + inline isl::union_pw_multi_aff add_pw_multi_aff(const isl::pw_multi_aff &pma) const; + inline isl::union_pw_multi_aff apply(const isl::union_pw_multi_aff &upma2) const; + inline isl::map as_map() const; + inline isl::multi_aff as_multi_aff() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::union_map as_union_map() const; + inline isl::pw_aff at(int pos) const; + inline isl::pw_aff get_at(int pos) const; + inline isl::set bind(const isl::multi_id &tuple) const; + inline isl::pw_multi_aff bind_domain(isl::multi_id tuple) const; + inline isl::pw_multi_aff bind_domain_wrapped_domain(isl::multi_id tuple) const; + inline isl::pw_multi_aff coalesce() const; + inline class size dim(isl::dim type) const; + inline isl::set domain() const; + static inline isl::pw_multi_aff domain_map(isl::space space); + inline isl::pw_multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::pw_multi_aff extract_pw_multi_aff(const isl::space &space) const; + inline isl::multi_pw_aff flat_range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff flat_range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff flat_range_product(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff flat_range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_multi_aff flat_range_product(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff flat_range_product(const isl::pw_aff &pma2) const; + inline stat foreach_piece(const std::function &fn) const; + static 
inline isl::pw_multi_aff from_map(isl::map map); + inline isl::pw_multi_aff gist(isl::set set) const; + inline isl::union_pw_multi_aff gist(const isl::union_set &context) const; + inline isl::pw_multi_aff gist(const isl::basic_set &set) const; + inline isl::pw_multi_aff gist(const isl::point &set) const; + inline boolean has_range_tuple_id() const; + inline isl::multi_pw_aff identity() const; + static inline isl::pw_multi_aff identity_on_domain(isl::space space); + inline isl::pw_multi_aff insert_domain(isl::space domain) const; + inline isl::pw_multi_aff intersect_domain(isl::set set) const; + inline isl::union_pw_multi_aff intersect_domain(const isl::space &space) const; + inline isl::union_pw_multi_aff intersect_domain(const isl::union_set &uset) const; + inline isl::pw_multi_aff intersect_domain(const isl::basic_set &set) const; + inline isl::pw_multi_aff intersect_domain(const isl::point &set) const; + inline isl::union_pw_multi_aff intersect_domain_wrapped_domain(const isl::union_set &uset) const; + inline isl::union_pw_multi_aff intersect_domain_wrapped_range(const isl::union_set &uset) const; + inline isl::pw_multi_aff intersect_params(isl::set set) const; + inline boolean involves_locals() const; + inline boolean involves_nan() const; + inline boolean involves_param(const isl::id &id) const; + inline boolean involves_param(const std::string &id) const; + inline boolean involves_param(const isl::id_list &list) const; + inline boolean isa_multi_aff() const; + inline boolean isa_pw_multi_aff() const; + inline isl::pw_aff_list list() const; + inline isl::multi_pw_aff max(const isl::multi_pw_aff &multi2) const; + inline isl::multi_val max_multi_val() const; + inline isl::multi_pw_aff min(const isl::multi_pw_aff &multi2) const; + inline isl::multi_val min_multi_val() const; + static inline isl::pw_multi_aff multi_val_on_domain(isl::set domain, isl::multi_val mv); + inline class size n_piece() const; + inline isl::multi_pw_aff neg() const; + inline boolean 
plain_is_empty() const; + inline boolean plain_is_equal(const isl::multi_pw_aff &multi2) const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff preimage_domain_wrapped_domain(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_multi_aff preimage_domain_wrapped_domain(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff preimage_domain_wrapped_domain(const isl::pw_aff &pma2) const; + inline isl::multi_pw_aff product(const isl::multi_pw_aff &multi2) const; + inline isl::pw_multi_aff product(isl::pw_multi_aff pma2) const; + inline isl::pw_multi_aff product(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff product(const isl::pw_aff &pma2) const; + static inline isl::pw_multi_aff project_out_map(isl::space space, isl::dim type, unsigned int first, unsigned int n); + inline isl::multi_pw_aff pullback(const isl::multi_pw_aff &mpa2) const; + inline isl::pw_multi_aff pullback(isl::multi_aff ma) const; + inline isl::pw_multi_aff pullback(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff pullback(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_multi_aff_list pw_multi_aff_list() const; + inline isl::pw_multi_aff range_factor_domain() const; + inline isl::pw_multi_aff range_factor_range() const; + static inline isl::pw_multi_aff range_map(isl::space space); + inline isl::multi_pw_aff range_product(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff range_product(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_multi_aff range_product(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff range_product(const isl::pw_aff &pma2) const; + inline isl::id range_tuple_id() 
const; + inline isl::id get_range_tuple_id() const; + inline isl::multi_pw_aff reset_range_tuple_id() const; + inline isl::multi_pw_aff reset_tuple_id(isl::dim type) const; + inline isl::multi_pw_aff scale(const isl::multi_val &mv) const; + inline isl::pw_multi_aff scale(isl::val v) const; + inline isl::pw_multi_aff scale(long v) const; + inline isl::multi_pw_aff scale_down(const isl::multi_val &mv) const; + inline isl::pw_multi_aff scale_down(isl::val v) const; + inline isl::pw_multi_aff scale_down(long v) const; + inline isl::multi_pw_aff set_at(int pos, const isl::pw_aff &el) const; + inline isl::multi_union_pw_aff set_at(int pos, const isl::union_pw_aff &el) const; + inline isl::multi_pw_aff set_pw_aff(int pos, const isl::pw_aff &el) const; + inline isl::pw_multi_aff set_pw_aff(unsigned int pos, isl::pw_aff pa) const; + inline isl::pw_multi_aff set_range_tuple(isl::id id) const; + inline isl::pw_multi_aff set_range_tuple(const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, const isl::union_pw_aff &el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_pw_aff sub(const isl::multi_pw_aff &multi2) const; + inline isl::multi_union_pw_aff sub(const isl::multi_union_pw_aff &multi2) const; + inline isl::pw_multi_aff sub(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff sub(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_multi_aff sub(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff sub(const isl::pw_aff &pma2) const; + inline isl::pw_multi_aff subtract_domain(isl::set set) const; + inline isl::union_pw_multi_aff subtract_domain(const isl::space &space) const; + inline isl::union_pw_multi_aff subtract_domain(const isl::union_set &uset) const; + inline isl::pw_multi_aff subtract_domain(const isl::basic_set &set) const; + inline isl::pw_multi_aff subtract_domain(const isl::point &set) const; + inline 
isl::pw_multi_aff_list to_list() const; + inline isl::multi_pw_aff to_multi_pw_aff() const; + inline isl::union_pw_multi_aff to_union_pw_multi_aff() const; + inline isl::id tuple_id(isl::dim type) const; + inline isl::id get_tuple_id(isl::dim type) const; + inline isl::multi_pw_aff unbind_params_insert_domain(const isl::multi_id &domain) const; + inline isl::multi_pw_aff union_add(const isl::multi_pw_aff &mpa2) const; + inline isl::multi_union_pw_aff union_add(const isl::multi_union_pw_aff &mupa2) const; + inline isl::pw_multi_aff union_add(isl::pw_multi_aff pma2) const; + inline isl::union_pw_multi_aff union_add(const isl::union_pw_multi_aff &upma2) const; + inline isl::pw_multi_aff union_add(const isl::multi_aff &pma2) const; + inline isl::pw_multi_aff union_add(const isl::pw_aff &pma2) const; + static inline isl::pw_multi_aff zero(isl::space space); +}; + +// declarations for isl::pw_multi_aff_list +inline pw_multi_aff_list manage(__isl_take isl_pw_multi_aff_list *ptr); +inline pw_multi_aff_list manage_copy(__isl_keep isl_pw_multi_aff_list *ptr); + +class pw_multi_aff_list { + friend inline pw_multi_aff_list manage(__isl_take isl_pw_multi_aff_list *ptr); + friend inline pw_multi_aff_list manage_copy(__isl_keep isl_pw_multi_aff_list *ptr); + +protected: + isl_pw_multi_aff_list *ptr = nullptr; + + inline explicit pw_multi_aff_list(__isl_take isl_pw_multi_aff_list *ptr); + +public: + inline /* implicit */ pw_multi_aff_list(); + inline /* implicit */ pw_multi_aff_list(const pw_multi_aff_list &obj); + inline explicit pw_multi_aff_list(isl::ctx ctx, int n); + inline explicit pw_multi_aff_list(isl::pw_multi_aff el); + inline explicit pw_multi_aff_list(isl::ctx ctx, const std::string &str); + inline pw_multi_aff_list &operator=(pw_multi_aff_list obj); + inline ~pw_multi_aff_list(); + inline __isl_give isl_pw_multi_aff_list *copy() const &; + inline __isl_give isl_pw_multi_aff_list *copy() && = delete; + inline __isl_keep isl_pw_multi_aff_list *get() const; + inline 
__isl_give isl_pw_multi_aff_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::pw_multi_aff_list add(isl::pw_multi_aff el) const; + inline isl::pw_multi_aff at(int index) const; + inline isl::pw_multi_aff get_at(int index) const; + inline isl::pw_multi_aff_list clear() const; + inline isl::pw_multi_aff_list concat(isl::pw_multi_aff_list list2) const; + inline isl::pw_multi_aff_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::pw_multi_aff_list insert(unsigned int pos, isl::pw_multi_aff el) const; + inline class size size() const; +}; + +// declarations for isl::schedule +inline schedule manage(__isl_take isl_schedule *ptr); +inline schedule manage_copy(__isl_keep isl_schedule *ptr); + +class schedule { + friend inline schedule manage(__isl_take isl_schedule *ptr); + friend inline schedule manage_copy(__isl_keep isl_schedule *ptr); + +protected: + isl_schedule *ptr = nullptr; + + inline explicit schedule(__isl_take isl_schedule *ptr); + +public: + inline /* implicit */ schedule(); + inline /* implicit */ schedule(const schedule &obj); + inline explicit schedule(isl::ctx ctx, const std::string &str); + inline schedule &operator=(schedule obj); + inline ~schedule(); + inline __isl_give isl_schedule *copy() const &; + inline __isl_give isl_schedule *copy() && = delete; + inline __isl_keep isl_schedule *get() const; + inline __isl_give isl_schedule *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::schedule align_params(isl::space space) const; + inline isl::union_set domain() const; + inline isl::union_set get_domain() const; + static inline isl::schedule from_domain(isl::union_set domain); + inline isl::schedule gist_domain_params(isl::set context) const; + inline isl::schedule insert_partial_schedule(isl::multi_union_pw_aff partial) const; + inline isl::schedule intersect_domain(isl::union_set domain) const; + 
inline isl::union_map map() const; + inline isl::union_map get_map() const; + inline isl::schedule pullback(isl::union_pw_multi_aff upma) const; + inline isl::schedule_node root() const; + inline isl::schedule_node get_root() const; + inline isl::schedule sequence(isl::schedule schedule2) const; +}; + +// declarations for isl::schedule_constraints +inline schedule_constraints manage(__isl_take isl_schedule_constraints *ptr); +inline schedule_constraints manage_copy(__isl_keep isl_schedule_constraints *ptr); + +class schedule_constraints { + friend inline schedule_constraints manage(__isl_take isl_schedule_constraints *ptr); + friend inline schedule_constraints manage_copy(__isl_keep isl_schedule_constraints *ptr); + +protected: + isl_schedule_constraints *ptr = nullptr; + + inline explicit schedule_constraints(__isl_take isl_schedule_constraints *ptr); + +public: + inline /* implicit */ schedule_constraints(); + inline /* implicit */ schedule_constraints(const schedule_constraints &obj); + inline explicit schedule_constraints(isl::ctx ctx, const std::string &str); + inline schedule_constraints &operator=(schedule_constraints obj); + inline ~schedule_constraints(); + inline __isl_give isl_schedule_constraints *copy() const &; + inline __isl_give isl_schedule_constraints *copy() && = delete; + inline __isl_keep isl_schedule_constraints *get() const; + inline __isl_give isl_schedule_constraints *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_map coincidence() const; + inline isl::union_map get_coincidence() const; + inline isl::schedule compute_schedule() const; + inline isl::union_map conditional_validity() const; + inline isl::union_map get_conditional_validity() const; + inline isl::union_map conditional_validity_condition() const; + inline isl::union_map get_conditional_validity_condition() const; + inline isl::set context() const; + inline isl::set get_context() const; + inline isl::union_set domain() const; + 
inline isl::union_set get_domain() const; + static inline isl::schedule_constraints on_domain(isl::union_set domain); + inline isl::union_map proximity() const; + inline isl::union_map get_proximity() const; + inline isl::schedule_constraints set_coincidence(isl::union_map coincidence) const; + inline isl::schedule_constraints set_conditional_validity(isl::union_map condition, isl::union_map validity) const; + inline isl::schedule_constraints set_context(isl::set context) const; + inline isl::schedule_constraints set_proximity(isl::union_map proximity) const; + inline isl::schedule_constraints set_validity(isl::union_map validity) const; + inline isl::union_map validity() const; + inline isl::union_map get_validity() const; +}; + +// declarations for isl::schedule_node +inline schedule_node manage(__isl_take isl_schedule_node *ptr); +inline schedule_node manage_copy(__isl_keep isl_schedule_node *ptr); + +class schedule_node { + friend inline schedule_node manage(__isl_take isl_schedule_node *ptr); + friend inline schedule_node manage_copy(__isl_keep isl_schedule_node *ptr); + +protected: + isl_schedule_node *ptr = nullptr; + + inline explicit schedule_node(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node(); + inline /* implicit */ schedule_node(const schedule_node &obj); + inline schedule_node &operator=(schedule_node obj); + inline ~schedule_node(); + inline __isl_give isl_schedule_node *copy() const &; + inline __isl_give isl_schedule_node *copy() && = delete; + inline __isl_keep isl_schedule_node *get() const; + inline __isl_give isl_schedule_node *release(); + inline bool is_null() const; +private: + template ::value>::type> + inline boolean isa_type(T subtype) const; +public: + template inline boolean isa() const; + template inline T as() const; + inline isl::ctx ctx() const; + + inline isl::schedule_node ancestor(int generation) const; + inline class size ancestor_child_position(const isl::schedule_node &ancestor) const; + 
inline class size get_ancestor_child_position(const isl::schedule_node &ancestor) const; + inline isl::schedule_node child(int pos) const; + inline class size child_position() const; + inline class size get_child_position() const; + inline isl::union_set domain() const; + inline isl::union_set get_domain() const; + inline boolean every_descendant(const std::function &test) const; + inline isl::schedule_node first_child() const; + inline stat foreach_ancestor_top_down(const std::function &fn) const; + inline stat foreach_descendant_top_down(const std::function &fn) const; + static inline isl::schedule_node from_domain(isl::union_set domain); + static inline isl::schedule_node from_extension(isl::union_map extension); + inline isl::schedule_node graft_after(isl::schedule_node graft) const; + inline isl::schedule_node graft_before(isl::schedule_node graft) const; + inline boolean has_children() const; + inline boolean has_next_sibling() const; + inline boolean has_parent() const; + inline boolean has_previous_sibling() const; + inline isl::schedule_node insert_context(isl::set context) const; + inline isl::schedule_node insert_filter(isl::union_set filter) const; + inline isl::schedule_node insert_guard(isl::set context) const; + inline isl::schedule_node insert_mark(isl::id mark) const; + inline isl::schedule_node insert_mark(const std::string &mark) const; + inline isl::schedule_node insert_partial_schedule(isl::multi_union_pw_aff schedule) const; + inline isl::schedule_node insert_sequence(isl::union_set_list filters) const; + inline isl::schedule_node insert_set(isl::union_set_list filters) const; + inline boolean is_equal(const isl::schedule_node &node2) const; + inline boolean is_subtree_anchored() const; + inline isl::schedule_node map_descendant_bottom_up(const std::function &fn) const; + inline class size n_children() const; + inline isl::schedule_node next_sibling() const; + inline isl::schedule_node order_after(isl::union_set filter) const; + inline 
isl::schedule_node order_before(isl::union_set filter) const; + inline isl::schedule_node parent() const; + inline isl::multi_union_pw_aff prefix_schedule_multi_union_pw_aff() const; + inline isl::multi_union_pw_aff get_prefix_schedule_multi_union_pw_aff() const; + inline isl::union_map prefix_schedule_relation() const; + inline isl::union_map get_prefix_schedule_relation() const; + inline isl::union_map prefix_schedule_union_map() const; + inline isl::union_map get_prefix_schedule_union_map() const; + inline isl::union_pw_multi_aff prefix_schedule_union_pw_multi_aff() const; + inline isl::union_pw_multi_aff get_prefix_schedule_union_pw_multi_aff() const; + inline isl::schedule_node previous_sibling() const; + inline isl::schedule_node root() const; + inline isl::schedule schedule() const; + inline isl::schedule get_schedule() const; + inline class size schedule_depth() const; + inline class size get_schedule_depth() const; + inline isl::schedule_node shared_ancestor(const isl::schedule_node &node2) const; + inline isl::schedule_node get_shared_ancestor(const isl::schedule_node &node2) const; + inline class size tree_depth() const; + inline class size get_tree_depth() const; + inline isl::union_set universe_domain() const; + inline isl::union_set get_universe_domain() const; +}; + +// declarations for isl::schedule_node_band + +class schedule_node_band : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_band schedule_node::as() const; + static const auto type = isl_schedule_node_band; + +protected: + inline explicit schedule_node_band(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_band(); + inline /* implicit */ schedule_node_band(const schedule_node_band &obj); + inline schedule_node_band &operator=(schedule_node_band obj); + inline isl::ctx ctx() const; + + inline isl::union_set ast_build_options() const; + inline isl::union_set get_ast_build_options() const; + inline 
isl::set ast_isolate_option() const; + inline isl::set get_ast_isolate_option() const; + inline boolean member_get_coincident(int pos) const; + inline schedule_node_band member_set_coincident(int pos, int coincident) const; + inline schedule_node_band mod(isl::multi_val mv) const; + inline class size n_member() const; + inline isl::multi_union_pw_aff partial_schedule() const; + inline isl::multi_union_pw_aff get_partial_schedule() const; + inline boolean permutable() const; + inline boolean get_permutable() const; + inline schedule_node_band scale(isl::multi_val mv) const; + inline schedule_node_band scale_down(isl::multi_val mv) const; + inline schedule_node_band set_ast_build_options(isl::union_set options) const; + inline schedule_node_band set_permutable(int permutable) const; + inline schedule_node_band shift(isl::multi_union_pw_aff shift) const; + inline schedule_node_band split(int pos) const; + inline schedule_node_band tile(isl::multi_val sizes) const; + inline schedule_node_band member_set_ast_loop_default(int pos) const; + inline schedule_node_band member_set_ast_loop_atomic(int pos) const; + inline schedule_node_band member_set_ast_loop_unroll(int pos) const; + inline schedule_node_band member_set_ast_loop_separate(int pos) const; +}; + +// declarations for isl::schedule_node_context + +class schedule_node_context : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_context schedule_node::as() const; + static const auto type = isl_schedule_node_context; + +protected: + inline explicit schedule_node_context(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_context(); + inline /* implicit */ schedule_node_context(const schedule_node_context &obj); + inline schedule_node_context &operator=(schedule_node_context obj); + inline isl::ctx ctx() const; + + inline isl::set context() const; + inline isl::set get_context() const; +}; + +// declarations for 
isl::schedule_node_domain + +class schedule_node_domain : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_domain schedule_node::as() const; + static const auto type = isl_schedule_node_domain; + +protected: + inline explicit schedule_node_domain(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_domain(); + inline /* implicit */ schedule_node_domain(const schedule_node_domain &obj); + inline schedule_node_domain &operator=(schedule_node_domain obj); + inline isl::ctx ctx() const; + + inline isl::union_set domain() const; + inline isl::union_set get_domain() const; +}; + +// declarations for isl::schedule_node_expansion + +class schedule_node_expansion : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_expansion schedule_node::as() const; + static const auto type = isl_schedule_node_expansion; + +protected: + inline explicit schedule_node_expansion(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_expansion(); + inline /* implicit */ schedule_node_expansion(const schedule_node_expansion &obj); + inline schedule_node_expansion &operator=(schedule_node_expansion obj); + inline isl::ctx ctx() const; + + inline isl::union_pw_multi_aff contraction() const; + inline isl::union_pw_multi_aff get_contraction() const; + inline isl::union_map expansion() const; + inline isl::union_map get_expansion() const; +}; + +// declarations for isl::schedule_node_extension + +class schedule_node_extension : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_extension schedule_node::as() const; + static const auto type = isl_schedule_node_extension; + +protected: + inline explicit schedule_node_extension(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_extension(); + inline /* implicit */ schedule_node_extension(const 
schedule_node_extension &obj); + inline schedule_node_extension &operator=(schedule_node_extension obj); + inline isl::ctx ctx() const; + + inline isl::union_map extension() const; + inline isl::union_map get_extension() const; +}; + +// declarations for isl::schedule_node_filter + +class schedule_node_filter : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_filter schedule_node::as() const; + static const auto type = isl_schedule_node_filter; + +protected: + inline explicit schedule_node_filter(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_filter(); + inline /* implicit */ schedule_node_filter(const schedule_node_filter &obj); + inline schedule_node_filter &operator=(schedule_node_filter obj); + inline isl::ctx ctx() const; + + inline isl::union_set filter() const; + inline isl::union_set get_filter() const; +}; + +// declarations for isl::schedule_node_guard + +class schedule_node_guard : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_guard schedule_node::as() const; + static const auto type = isl_schedule_node_guard; + +protected: + inline explicit schedule_node_guard(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_guard(); + inline /* implicit */ schedule_node_guard(const schedule_node_guard &obj); + inline schedule_node_guard &operator=(schedule_node_guard obj); + inline isl::ctx ctx() const; + + inline isl::set guard() const; + inline isl::set get_guard() const; +}; + +// declarations for isl::schedule_node_leaf + +class schedule_node_leaf : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_leaf schedule_node::as() const; + static const auto type = isl_schedule_node_leaf; + +protected: + inline explicit schedule_node_leaf(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_leaf(); + inline /* 
implicit */ schedule_node_leaf(const schedule_node_leaf &obj); + inline schedule_node_leaf &operator=(schedule_node_leaf obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::schedule_node_mark + +class schedule_node_mark : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_mark schedule_node::as() const; + static const auto type = isl_schedule_node_mark; + +protected: + inline explicit schedule_node_mark(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_mark(); + inline /* implicit */ schedule_node_mark(const schedule_node_mark &obj); + inline schedule_node_mark &operator=(schedule_node_mark obj); + inline isl::ctx ctx() const; + + inline isl::id id() const; + inline isl::id get_id() const; +}; + +// declarations for isl::schedule_node_sequence + +class schedule_node_sequence : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_sequence schedule_node::as() const; + static const auto type = isl_schedule_node_sequence; + +protected: + inline explicit schedule_node_sequence(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_sequence(); + inline /* implicit */ schedule_node_sequence(const schedule_node_sequence &obj); + inline schedule_node_sequence &operator=(schedule_node_sequence obj); + inline isl::ctx ctx() const; + +}; + +// declarations for isl::schedule_node_set + +class schedule_node_set : public schedule_node { + template + friend boolean schedule_node::isa() const; + friend schedule_node_set schedule_node::as() const; + static const auto type = isl_schedule_node_set; + +protected: + inline explicit schedule_node_set(__isl_take isl_schedule_node *ptr); + +public: + inline /* implicit */ schedule_node_set(); + inline /* implicit */ schedule_node_set(const schedule_node_set &obj); + inline schedule_node_set &operator=(schedule_node_set obj); + inline isl::ctx ctx() const; + 
+}; + +// declarations for isl::set +inline set manage(__isl_take isl_set *ptr); +inline set manage_copy(__isl_keep isl_set *ptr); + +class set { + friend inline set manage(__isl_take isl_set *ptr); + friend inline set manage_copy(__isl_keep isl_set *ptr); + +protected: + isl_set *ptr = nullptr; + + inline explicit set(__isl_take isl_set *ptr); + +public: + inline /* implicit */ set(); + inline /* implicit */ set(const set &obj); + inline /* implicit */ set(isl::basic_set bset); + inline /* implicit */ set(isl::point pnt); + inline explicit set(isl::union_set uset); + inline explicit set(isl::ctx ctx, const std::string &str); + inline set &operator=(set obj); + inline ~set(); + inline __isl_give isl_set *copy() const &; + inline __isl_give isl_set *copy() && = delete; + inline __isl_keep isl_set *get() const; + inline __isl_give isl_set *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::set add_constraint(isl::constraint constraint) const; + inline isl::set add_dims(isl::dim type, unsigned int n) const; + inline isl::basic_set affine_hull() const; + inline isl::set align_params(isl::space model) const; + inline isl::set apply(isl::map map) const; + inline isl::union_set apply(const isl::union_map &umap) const; + inline isl::set apply(const isl::basic_map &map) const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::set as_set() const; + inline isl::basic_set_list basic_set_list() const; + inline isl::basic_set_list get_basic_set_list() const; + inline isl::set bind(isl::multi_id tuple) const; + inline isl::set coalesce() const; + inline isl::set complement() const; + inline isl::union_set compute_divs() const; + inline boolean contains(const isl::space &space) const; + inline isl::basic_set convex_hull() const; + inline isl::set detect_equalities() const; + inline class size dim(isl::dim type) const; + inline boolean dim_has_any_lower_bound(isl::dim type, unsigned int pos) const; + inline isl::id 
dim_id(isl::dim type, unsigned int pos) const; + inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; + inline isl::pw_aff dim_max(int pos) const; + inline isl::val dim_max_val(int pos) const; + inline isl::pw_aff dim_min(int pos) const; + inline isl::val dim_min_val(int pos) const; + inline std::string dim_name(isl::dim type, unsigned int pos) const; + inline std::string get_dim_name(isl::dim type, unsigned int pos) const; + inline isl::set drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set eliminate(isl::dim type, unsigned int first, unsigned int n) const; + static inline isl::set empty(isl::space space); + inline boolean every_set(const std::function &test) const; + inline isl::set extract_set(const isl::space &space) const; + inline int find_dim_by_id(isl::dim type, const isl::id &id) const; + inline int find_dim_by_id(isl::dim type, const std::string &id) const; + inline isl::set fix_si(isl::dim type, unsigned int pos, int value) const; + inline isl::set flatten() const; + inline stat foreach_basic_set(const std::function &fn) const; + inline stat foreach_point(const std::function &fn) const; + inline stat foreach_set(const std::function &fn) const; + inline isl::set gist(isl::set context) const; + inline isl::union_set gist(const isl::union_set &context) const; + inline isl::set gist(const isl::basic_set &context) const; + inline isl::set gist(const isl::point &context) const; + inline isl::set gist_params(isl::set context) const; + inline boolean has_equal_space(const isl::set &set2) const; + inline isl::map identity() const; + inline isl::union_pw_multi_aff identity_union_pw_multi_aff() const; + inline isl::pw_aff indicator_function() const; + inline isl::set insert_dims(isl::dim type, unsigned int pos, unsigned int n) const; + inline isl::map insert_domain(isl::space domain) const; + inline isl::set intersect(isl::set set2) const; + inline isl::union_set intersect(const isl::union_set 
&uset2) const; + inline isl::set intersect(const isl::basic_set &set2) const; + inline isl::set intersect(const isl::point &set2) const; + inline isl::set intersect_params(isl::set params) const; + inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline boolean involves_locals() const; + inline boolean is_bounded() const; + inline boolean is_disjoint(const isl::set &set2) const; + inline boolean is_disjoint(const isl::union_set &uset2) const; + inline boolean is_disjoint(const isl::basic_set &set2) const; + inline boolean is_disjoint(const isl::point &set2) const; + inline boolean is_empty() const; + inline boolean is_equal(const isl::set &set2) const; + inline boolean is_equal(const isl::union_set &uset2) const; + inline boolean is_equal(const isl::basic_set &set2) const; + inline boolean is_equal(const isl::point &set2) const; + inline boolean is_params() const; + inline boolean is_singleton() const; + inline boolean is_strict_subset(const isl::set &set2) const; + inline boolean is_strict_subset(const isl::union_set &uset2) const; + inline boolean is_strict_subset(const isl::basic_set &set2) const; + inline boolean is_strict_subset(const isl::point &set2) const; + inline boolean is_subset(const isl::set &set2) const; + inline boolean is_subset(const isl::union_set &uset2) const; + inline boolean is_subset(const isl::basic_set &set2) const; + inline boolean is_subset(const isl::point &set2) const; + inline boolean is_wrapping() const; + inline boolean isa_set() const; + inline isl::set lexmax() const; + inline isl::pw_multi_aff lexmax_pw_multi_aff() const; + inline isl::set lexmin() const; + inline isl::pw_multi_aff lexmin_pw_multi_aff() const; + inline isl::set lower_bound(isl::multi_pw_aff lower) const; + inline isl::set lower_bound(isl::multi_val lower) const; + inline isl::set lower_bound_si(isl::dim type, unsigned int pos, int value) const; + inline isl::set lower_bound_val(isl::dim type, unsigned int pos, isl::val 
value) const; + inline isl::set lower_bound_val(isl::dim type, unsigned int pos, long value) const; + inline isl::multi_pw_aff max_multi_pw_aff() const; + inline isl::val max_val(const isl::aff &obj) const; + inline isl::multi_pw_aff min_multi_pw_aff() const; + inline isl::val min_val(const isl::aff &obj) const; + inline class size n_basic_set() const; + inline isl::set params() const; + inline isl::val plain_get_val_if_fixed(isl::dim type, unsigned int pos) const; + inline isl::multi_val plain_multi_val_if_fixed() const; + inline isl::multi_val get_plain_multi_val_if_fixed() const; + inline isl::basic_set polyhedral_hull() const; + inline isl::set preimage(isl::multi_aff ma) const; + inline isl::set preimage(isl::multi_pw_aff mpa) const; + inline isl::set preimage(isl::pw_multi_aff pma) const; + inline isl::union_set preimage(const isl::union_pw_multi_aff &upma) const; + inline isl::set product(isl::set set2) const; + inline isl::set project_out(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set project_out_all_params() const; + inline isl::set project_out_param(isl::id id) const; + inline isl::set project_out_param(const std::string &id) const; + inline isl::set project_out_param(isl::id_list list) const; + inline isl::pw_multi_aff pw_multi_aff_on_domain(isl::multi_val mv) const; + inline isl::set remove_dims(isl::dim type, unsigned int first, unsigned int n) const; + inline isl::set remove_divs() const; + inline isl::set remove_redundancies() const; + inline isl::set reset_tuple_id() const; + inline isl::basic_set sample() const; + inline isl::point sample_point() const; + inline isl::set set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; + inline isl::set set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const; + inline isl::set_list set_list() const; + inline isl::set set_tuple_id(isl::id id) const; + inline isl::set set_tuple_id(const std::string &id) const; + inline isl::fixed_box simple_fixed_box_hull() 
const; + inline isl::fixed_box get_simple_fixed_box_hull() const; + inline isl::basic_set simple_hull() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::val stride(int pos) const; + inline isl::val get_stride(int pos) const; + inline isl::set subtract(isl::set set2) const; + inline isl::union_set subtract(const isl::union_set &uset2) const; + inline isl::set subtract(const isl::basic_set &set2) const; + inline isl::set subtract(const isl::point &set2) const; + inline isl::set_list to_list() const; + inline isl::union_set to_union_set() const; + inline isl::map translation() const; + inline class size tuple_dim() const; + inline isl::id tuple_id() const; + inline isl::id get_tuple_id() const; + inline std::string tuple_name() const; + inline std::string get_tuple_name() const; + inline isl::set unbind_params(isl::multi_id tuple) const; + inline isl::map unbind_params_insert_domain(isl::multi_id domain) const; + inline isl::set unite(isl::set set2) const; + inline isl::union_set unite(const isl::union_set &uset2) const; + inline isl::set unite(const isl::basic_set &set2) const; + inline isl::set unite(const isl::point &set2) const; + static inline isl::set universe(isl::space space); + inline isl::basic_set unshifted_simple_hull() const; + inline isl::map unwrap() const; + inline isl::set upper_bound(isl::multi_pw_aff upper) const; + inline isl::set upper_bound(isl::multi_val upper) const; + inline isl::set upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const; + inline isl::set upper_bound_val(isl::dim type, unsigned int pos, long value) const; +}; + +// declarations for isl::set_list +inline set_list manage(__isl_take isl_set_list *ptr); +inline set_list manage_copy(__isl_keep isl_set_list *ptr); + +class set_list { + friend inline set_list manage(__isl_take isl_set_list *ptr); + friend inline set_list manage_copy(__isl_keep isl_set_list *ptr); + +protected: + isl_set_list *ptr = nullptr; + + inline 
explicit set_list(__isl_take isl_set_list *ptr); + +public: + inline /* implicit */ set_list(); + inline /* implicit */ set_list(const set_list &obj); + inline explicit set_list(isl::ctx ctx, int n); + inline explicit set_list(isl::set el); + inline explicit set_list(isl::ctx ctx, const std::string &str); + inline set_list &operator=(set_list obj); + inline ~set_list(); + inline __isl_give isl_set_list *copy() const &; + inline __isl_give isl_set_list *copy() && = delete; + inline __isl_keep isl_set_list *get() const; + inline __isl_give isl_set_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::set_list add(isl::set el) const; + inline isl::set at(int index) const; + inline isl::set get_at(int index) const; + inline isl::set_list clear() const; + inline isl::set_list concat(isl::set_list list2) const; + inline isl::set_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::set_list insert(unsigned int pos, isl::set el) const; + inline class size size() const; +}; + +// declarations for isl::space +inline space manage(__isl_take isl_space *ptr); +inline space manage_copy(__isl_keep isl_space *ptr); + +class space { + friend inline space manage(__isl_take isl_space *ptr); + friend inline space manage_copy(__isl_keep isl_space *ptr); + +protected: + isl_space *ptr = nullptr; + + inline explicit space(__isl_take isl_space *ptr); + +public: + inline /* implicit */ space(); + inline /* implicit */ space(const space &obj); + inline explicit space(isl::ctx ctx, unsigned int nparam, unsigned int n_in, unsigned int n_out); + inline explicit space(isl::ctx ctx, unsigned int nparam, unsigned int dim); + inline space &operator=(space obj); + inline ~space(); + inline __isl_give isl_space *copy() const &; + inline __isl_give isl_space *copy() && = delete; + inline __isl_keep isl_space *get() const; + inline __isl_give isl_space *release(); + inline bool is_null() const; 
+ inline isl::ctx ctx() const; + + inline isl::space add_dims(isl::dim type, unsigned int n) const; + inline isl::space add_named_tuple(isl::id tuple_id, unsigned int dim) const; + inline isl::space add_named_tuple(const std::string &tuple_id, unsigned int dim) const; + inline isl::space add_param(isl::id id) const; + inline isl::space add_param(const std::string &id) const; + inline isl::space add_unnamed_tuple(unsigned int dim) const; + inline isl::space align_params(isl::space space2) const; + inline isl::space curry() const; + inline class size dim(isl::dim type) const; + inline isl::id dim_id(isl::dim type, unsigned int pos) const; + inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; + inline isl::space domain() const; + inline isl::multi_aff domain_map_multi_aff() const; + inline isl::pw_multi_aff domain_map_pw_multi_aff() const; + inline isl::id domain_tuple_id() const; + inline isl::id get_domain_tuple_id() const; + inline isl::space drop_dims(isl::dim type, unsigned int first, unsigned int num) const; + inline int find_dim_by_id(isl::dim type, const isl::id &id) const; + inline int find_dim_by_id(isl::dim type, const std::string &id) const; + inline isl::space flatten_domain() const; + inline isl::space flatten_range() const; + inline boolean has_domain_tuple_id() const; + inline boolean has_equal_tuples(const isl::space &space2) const; + inline boolean has_range_tuple_id() const; + inline boolean has_tuple_id(isl::dim type) const; + inline boolean has_tuple_name(isl::dim type) const; + inline isl::multi_aff identity_multi_aff_on_domain() const; + inline isl::multi_pw_aff identity_multi_pw_aff_on_domain() const; + inline isl::pw_multi_aff identity_pw_multi_aff_on_domain() const; + inline boolean is_equal(const isl::space &space2) const; + inline boolean is_params() const; + inline boolean is_set() const; + inline boolean is_wrapping() const; + inline isl::space map_from_domain_and_range(isl::space range) const; + inline isl::space 
map_from_set() const; + inline isl::multi_aff multi_aff(isl::aff_list list) const; + inline isl::multi_aff multi_aff_on_domain(isl::multi_val mv) const; + inline isl::multi_id multi_id(isl::id_list list) const; + inline isl::multi_pw_aff multi_pw_aff(isl::pw_aff_list list) const; + inline isl::multi_union_pw_aff multi_union_pw_aff(isl::union_pw_aff_list list) const; + inline isl::multi_val multi_val(isl::val_list list) const; + inline isl::aff param_aff_on_domain(isl::id id) const; + inline isl::aff param_aff_on_domain(const std::string &id) const; + inline isl::space params() const; + static inline isl::space params_alloc(isl::ctx ctx, unsigned int nparam); + inline isl::space product(isl::space right) const; + inline isl::space range() const; + inline isl::multi_aff range_map_multi_aff() const; + inline isl::pw_multi_aff range_map_pw_multi_aff() const; + inline isl::space range_reverse() const; + inline isl::id range_tuple_id() const; + inline isl::id get_range_tuple_id() const; + inline isl::space reverse() const; + inline isl::space set_dim_id(isl::dim type, unsigned int pos, isl::id id) const; + inline isl::space set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const; + inline isl::space set_domain_tuple(isl::id id) const; + inline isl::space set_domain_tuple(const std::string &id) const; + inline isl::space set_from_params() const; + inline isl::space set_range_tuple(isl::id id) const; + inline isl::space set_range_tuple(const std::string &id) const; + inline isl::space set_tuple_id(isl::dim type, isl::id id) const; + inline isl::space set_tuple_id(isl::dim type, const std::string &id) const; + inline isl::id tuple_id(isl::dim type) const; + inline isl::id get_tuple_id(isl::dim type) const; + inline std::string tuple_name(isl::dim type) const; + inline std::string get_tuple_name(isl::dim type) const; + inline isl::space uncurry() const; + static inline isl::space unit(isl::ctx ctx); + inline isl::map universe_map() const; + inline isl::set 
universe_set() const; + inline isl::space unwrap() const; + inline isl::space wrap() const; + inline isl::aff zero_aff_on_domain() const; + inline isl::multi_aff zero_multi_aff() const; + inline isl::multi_pw_aff zero_multi_pw_aff() const; + inline isl::multi_union_pw_aff zero_multi_union_pw_aff() const; + inline isl::multi_val zero_multi_val() const; +}; + +// declarations for isl::union_access_info +inline union_access_info manage(__isl_take isl_union_access_info *ptr); +inline union_access_info manage_copy(__isl_keep isl_union_access_info *ptr); + +class union_access_info { + friend inline union_access_info manage(__isl_take isl_union_access_info *ptr); + friend inline union_access_info manage_copy(__isl_keep isl_union_access_info *ptr); + +protected: + isl_union_access_info *ptr = nullptr; + + inline explicit union_access_info(__isl_take isl_union_access_info *ptr); + +public: + inline /* implicit */ union_access_info(); + inline /* implicit */ union_access_info(const union_access_info &obj); + inline explicit union_access_info(isl::union_map sink); + inline union_access_info &operator=(union_access_info obj); + inline ~union_access_info(); + inline __isl_give isl_union_access_info *copy() const &; + inline __isl_give isl_union_access_info *copy() && = delete; + inline __isl_keep isl_union_access_info *get() const; + inline __isl_give isl_union_access_info *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_flow compute_flow() const; + inline isl::union_access_info set_kill(isl::union_map kill) const; + inline isl::union_access_info set_may_source(isl::union_map may_source) const; + inline isl::union_access_info set_must_source(isl::union_map must_source) const; + inline isl::union_access_info set_schedule(isl::schedule schedule) const; + inline isl::union_access_info set_schedule_map(isl::union_map schedule_map) const; +}; + +// declarations for isl::union_flow +inline union_flow manage(__isl_take isl_union_flow 
*ptr); +inline union_flow manage_copy(__isl_keep isl_union_flow *ptr); + +class union_flow { + friend inline union_flow manage(__isl_take isl_union_flow *ptr); + friend inline union_flow manage_copy(__isl_keep isl_union_flow *ptr); + +protected: + isl_union_flow *ptr = nullptr; + + inline explicit union_flow(__isl_take isl_union_flow *ptr); + +public: + inline /* implicit */ union_flow(); + inline /* implicit */ union_flow(const union_flow &obj); + inline union_flow &operator=(union_flow obj); + inline ~union_flow(); + inline __isl_give isl_union_flow *copy() const &; + inline __isl_give isl_union_flow *copy() && = delete; + inline __isl_keep isl_union_flow *get() const; + inline __isl_give isl_union_flow *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_map full_may_dependence() const; + inline isl::union_map get_full_may_dependence() const; + inline isl::union_map full_must_dependence() const; + inline isl::union_map get_full_must_dependence() const; + inline isl::union_map may_dependence() const; + inline isl::union_map get_may_dependence() const; + inline isl::union_map may_no_source() const; + inline isl::union_map get_may_no_source() const; + inline isl::union_map must_dependence() const; + inline isl::union_map get_must_dependence() const; + inline isl::union_map must_no_source() const; + inline isl::union_map get_must_no_source() const; +}; + +// declarations for isl::union_map +inline union_map manage(__isl_take isl_union_map *ptr); +inline union_map manage_copy(__isl_keep isl_union_map *ptr); + +class union_map { + friend inline union_map manage(__isl_take isl_union_map *ptr); + friend inline union_map manage_copy(__isl_keep isl_union_map *ptr); + +protected: + isl_union_map *ptr = nullptr; + + inline explicit union_map(__isl_take isl_union_map *ptr); + +public: + inline /* implicit */ union_map(); + inline /* implicit */ union_map(const union_map &obj); + inline /* implicit */ union_map(isl::basic_map bmap); 
+ inline /* implicit */ union_map(isl::map map); + inline explicit union_map(isl::ctx ctx, const std::string &str); + inline union_map &operator=(union_map obj); + inline ~union_map(); + inline __isl_give isl_union_map *copy() const &; + inline __isl_give isl_union_map *copy() && = delete; + inline __isl_keep isl_union_map *get() const; + inline __isl_give isl_union_map *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_map affine_hull() const; + inline isl::union_map apply_domain(isl::union_map umap2) const; + inline isl::union_map apply_range(isl::union_map umap2) const; + inline isl::map as_map() const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::union_pw_multi_aff as_union_pw_multi_aff() const; + inline isl::union_set bind_range(isl::multi_id tuple) const; + inline isl::union_map coalesce() const; + inline isl::union_map compute_divs() const; + inline isl::union_map curry() const; + inline isl::union_set deltas() const; + inline isl::union_map detect_equalities() const; + inline isl::union_set domain() const; + inline isl::union_map domain_factor_domain() const; + inline isl::union_map domain_factor_range() const; + inline isl::union_map domain_map() const; + inline isl::union_pw_multi_aff domain_map_union_pw_multi_aff() const; + inline isl::union_map domain_product(isl::union_map umap2) const; + static inline isl::union_map empty(isl::ctx ctx); + inline isl::union_map eq_at(isl::multi_union_pw_aff mupa) const; + inline boolean every_map(const std::function &test) const; + inline isl::map extract_map(isl::space space) const; + inline isl::union_map factor_domain() const; + inline isl::union_map factor_range() const; + inline isl::union_map fixed_power(isl::val exp) const; + inline isl::union_map fixed_power(long exp) const; + inline isl::union_map flat_range_product(isl::union_map umap2) const; + inline stat foreach_map(const std::function &fn) const; + static inline isl::union_map 
from(isl::multi_union_pw_aff mupa); + static inline isl::union_map from(isl::union_pw_multi_aff upma); + static inline isl::union_map from_domain(isl::union_set uset); + static inline isl::union_map from_domain_and_range(isl::union_set domain, isl::union_set range); + static inline isl::union_map from_range(isl::union_set uset); + inline isl::union_map gist(isl::union_map context) const; + inline isl::union_map gist_domain(isl::union_set uset) const; + inline isl::union_map gist_params(isl::set set) const; + inline isl::union_map gist_range(isl::union_set uset) const; + inline isl::union_map intersect(isl::union_map umap2) const; + inline isl::union_map intersect_domain(isl::space space) const; + inline isl::union_map intersect_domain(isl::union_set uset) const; + inline isl::union_map intersect_domain_factor_domain(isl::union_map factor) const; + inline isl::union_map intersect_domain_factor_range(isl::union_map factor) const; + inline isl::union_map intersect_params(isl::set set) const; + inline isl::union_map intersect_range(isl::space space) const; + inline isl::union_map intersect_range(isl::union_set uset) const; + inline isl::union_map intersect_range_factor_domain(isl::union_map factor) const; + inline isl::union_map intersect_range_factor_range(isl::union_map factor) const; + inline boolean is_bijective() const; + inline boolean is_disjoint(const isl::union_map &umap2) const; + inline boolean is_empty() const; + inline boolean is_equal(const isl::union_map &umap2) const; + inline boolean is_injective() const; + inline boolean is_single_valued() const; + inline boolean is_strict_subset(const isl::union_map &umap2) const; + inline boolean is_subset(const isl::union_map &umap2) const; + inline boolean isa_map() const; + inline isl::union_map lexmax() const; + inline isl::union_map lexmin() const; + inline isl::map_list map_list() const; + inline isl::map_list get_map_list() const; + inline isl::set params() const; + inline isl::union_map polyhedral_hull() 
const; + inline isl::union_map preimage_domain(isl::multi_aff ma) const; + inline isl::union_map preimage_domain(isl::multi_pw_aff mpa) const; + inline isl::union_map preimage_domain(isl::pw_multi_aff pma) const; + inline isl::union_map preimage_domain(isl::union_pw_multi_aff upma) const; + inline isl::union_map preimage_range(isl::multi_aff ma) const; + inline isl::union_map preimage_range(isl::pw_multi_aff pma) const; + inline isl::union_map preimage_range(isl::union_pw_multi_aff upma) const; + inline isl::union_map product(isl::union_map umap2) const; + inline isl::union_map project_out_all_params() const; + inline isl::union_set range() const; + inline isl::union_map range_factor_domain() const; + inline isl::union_map range_factor_range() const; + inline isl::union_map range_map() const; + inline isl::union_map range_product(isl::union_map umap2) const; + inline isl::union_map range_reverse() const; + inline isl::union_map reverse() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::union_map subtract(isl::union_map umap2) const; + inline isl::union_map subtract_domain(isl::union_set dom) const; + inline isl::union_map subtract_range(isl::union_set dom) const; + inline isl::union_map uncurry() const; + inline isl::union_map unite(isl::union_map umap2) const; + inline isl::union_map universe() const; + inline isl::union_set wrap() const; + inline isl::union_map zip() const; +}; + +// declarations for isl::union_pw_aff +inline union_pw_aff manage(__isl_take isl_union_pw_aff *ptr); +inline union_pw_aff manage_copy(__isl_keep isl_union_pw_aff *ptr); + +class union_pw_aff { + friend inline union_pw_aff manage(__isl_take isl_union_pw_aff *ptr); + friend inline union_pw_aff manage_copy(__isl_keep isl_union_pw_aff *ptr); + +protected: + isl_union_pw_aff *ptr = nullptr; + + inline explicit union_pw_aff(__isl_take isl_union_pw_aff *ptr); + +public: + inline /* implicit */ union_pw_aff(); + inline /* implicit */ 
union_pw_aff(const union_pw_aff &obj); + inline /* implicit */ union_pw_aff(isl::aff aff); + inline /* implicit */ union_pw_aff(isl::pw_aff pa); + inline explicit union_pw_aff(isl::ctx ctx, const std::string &str); + inline explicit union_pw_aff(isl::union_set domain, isl::val v); + inline union_pw_aff &operator=(union_pw_aff obj); + inline ~union_pw_aff(); + inline __isl_give isl_union_pw_aff *copy() const &; + inline __isl_give isl_union_pw_aff *copy() && = delete; + inline __isl_keep isl_union_pw_aff *get() const; + inline __isl_give isl_union_pw_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::multi_union_pw_aff add(const isl::multi_union_pw_aff &multi2) const; + inline isl::union_pw_aff add(isl::union_pw_aff upa2) const; + inline isl::union_pw_multi_aff add(const isl::union_pw_multi_aff &upma2) const; + inline isl::union_pw_aff add(const isl::aff &upa2) const; + inline isl::union_pw_aff add(const isl::pw_aff &upa2) const; + inline isl::union_pw_multi_aff add_pw_multi_aff(const isl::pw_multi_aff &pma) const; + inline isl::union_pw_multi_aff apply(const isl::union_pw_multi_aff &upma2) const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::union_map as_union_map() const; + inline isl::union_pw_aff at(int pos) const; + inline isl::union_set bind(const isl::multi_id &tuple) const; + inline isl::union_set bind(isl::id id) const; + inline isl::union_set bind(const std::string &id) const; + inline isl::union_pw_aff coalesce() const; + inline class size dim(isl::dim type) const; + inline isl::union_set domain() const; + static inline isl::union_pw_aff empty(isl::space space); + inline isl::pw_multi_aff extract_pw_multi_aff(const isl::space &space) const; + inline isl::multi_union_pw_aff flat_range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::union_pw_multi_aff flat_range_product(const isl::union_pw_multi_aff &upma2) 
const; + inline stat foreach_pw_aff(const std::function &fn) const; + inline isl::union_pw_aff gist(isl::union_set context) const; + inline boolean has_range_tuple_id() const; + inline isl::union_pw_aff intersect_domain(isl::space space) const; + inline isl::union_pw_aff intersect_domain(isl::union_set uset) const; + inline isl::union_pw_aff intersect_domain_wrapped_domain(isl::union_set uset) const; + inline isl::union_pw_aff intersect_domain_wrapped_range(isl::union_set uset) const; + inline isl::union_pw_aff intersect_params(isl::set set) const; + inline boolean involves_locals() const; + inline boolean involves_nan() const; + inline boolean isa_pw_multi_aff() const; + inline isl::union_pw_aff_list list() const; + inline isl::multi_union_pw_aff neg() const; + inline boolean plain_is_empty() const; + inline boolean plain_is_equal(const isl::multi_union_pw_aff &multi2) const; + inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const; + inline isl::union_pw_aff pullback(isl::union_pw_multi_aff upma) const; + inline isl::pw_multi_aff_list pw_multi_aff_list() const; + inline isl::union_pw_multi_aff range_factor_domain() const; + inline isl::union_pw_multi_aff range_factor_range() const; + inline isl::multi_union_pw_aff range_product(const isl::multi_union_pw_aff &multi2) const; + inline isl::union_pw_multi_aff range_product(const isl::union_pw_multi_aff &upma2) const; + inline isl::id range_tuple_id() const; + inline isl::multi_union_pw_aff reset_range_tuple_id() const; + inline isl::multi_union_pw_aff reset_tuple_id(isl::dim type) const; + inline isl::multi_union_pw_aff scale(const isl::multi_val &mv) const; + inline isl::multi_union_pw_aff scale(const isl::val &v) const; + inline isl::multi_union_pw_aff scale(long v) const; + inline isl::multi_union_pw_aff scale_down(const isl::multi_val &mv) const; + inline isl::multi_union_pw_aff scale_down(const isl::val &v) const; + inline isl::multi_union_pw_aff scale_down(long 
v) const; + inline isl::multi_union_pw_aff set_at(int pos, const isl::union_pw_aff &el) const; + inline isl::multi_union_pw_aff set_range_tuple(const isl::id &id) const; + inline isl::multi_union_pw_aff set_range_tuple(const std::string &id) const; + inline isl::multi_union_pw_aff set_union_pw_aff(int pos, const isl::union_pw_aff &el) const; + inline class size size() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::multi_union_pw_aff sub(const isl::multi_union_pw_aff &multi2) const; + inline isl::union_pw_aff sub(isl::union_pw_aff upa2) const; + inline isl::union_pw_multi_aff sub(const isl::union_pw_multi_aff &upma2) const; + inline isl::union_pw_aff sub(const isl::aff &upa2) const; + inline isl::union_pw_aff sub(const isl::pw_aff &upa2) const; + inline isl::union_pw_aff subtract_domain(isl::space space) const; + inline isl::union_pw_aff subtract_domain(isl::union_set uset) const; + inline isl::union_pw_aff_list to_list() const; + inline isl::multi_union_pw_aff union_add(const isl::multi_union_pw_aff &mupa2) const; + inline isl::union_pw_aff union_add(isl::union_pw_aff upa2) const; + inline isl::union_pw_multi_aff union_add(const isl::union_pw_multi_aff &upma2) const; + inline isl::union_pw_aff union_add(const isl::aff &upa2) const; + inline isl::union_pw_aff union_add(const isl::pw_aff &upa2) const; +}; + +// declarations for isl::union_pw_aff_list +inline union_pw_aff_list manage(__isl_take isl_union_pw_aff_list *ptr); +inline union_pw_aff_list manage_copy(__isl_keep isl_union_pw_aff_list *ptr); + +class union_pw_aff_list { + friend inline union_pw_aff_list manage(__isl_take isl_union_pw_aff_list *ptr); + friend inline union_pw_aff_list manage_copy(__isl_keep isl_union_pw_aff_list *ptr); + +protected: + isl_union_pw_aff_list *ptr = nullptr; + + inline explicit union_pw_aff_list(__isl_take isl_union_pw_aff_list *ptr); + +public: + inline /* implicit */ union_pw_aff_list(); + inline /* implicit */ 
union_pw_aff_list(const union_pw_aff_list &obj); + inline explicit union_pw_aff_list(isl::ctx ctx, int n); + inline explicit union_pw_aff_list(isl::union_pw_aff el); + inline explicit union_pw_aff_list(isl::ctx ctx, const std::string &str); + inline union_pw_aff_list &operator=(union_pw_aff_list obj); + inline ~union_pw_aff_list(); + inline __isl_give isl_union_pw_aff_list *copy() const &; + inline __isl_give isl_union_pw_aff_list *copy() && = delete; + inline __isl_keep isl_union_pw_aff_list *get() const; + inline __isl_give isl_union_pw_aff_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_pw_aff_list add(isl::union_pw_aff el) const; + inline isl::union_pw_aff at(int index) const; + inline isl::union_pw_aff get_at(int index) const; + inline isl::union_pw_aff_list clear() const; + inline isl::union_pw_aff_list concat(isl::union_pw_aff_list list2) const; + inline isl::union_pw_aff_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::union_pw_aff_list insert(unsigned int pos, isl::union_pw_aff el) const; + inline class size size() const; +}; + +// declarations for isl::union_pw_multi_aff +inline union_pw_multi_aff manage(__isl_take isl_union_pw_multi_aff *ptr); +inline union_pw_multi_aff manage_copy(__isl_keep isl_union_pw_multi_aff *ptr); + +class union_pw_multi_aff { + friend inline union_pw_multi_aff manage(__isl_take isl_union_pw_multi_aff *ptr); + friend inline union_pw_multi_aff manage_copy(__isl_keep isl_union_pw_multi_aff *ptr); + +protected: + isl_union_pw_multi_aff *ptr = nullptr; + + inline explicit union_pw_multi_aff(__isl_take isl_union_pw_multi_aff *ptr); + +public: + inline /* implicit */ union_pw_multi_aff(); + inline /* implicit */ union_pw_multi_aff(const union_pw_multi_aff &obj); + inline explicit union_pw_multi_aff(isl::union_set uset); + inline /* implicit */ union_pw_multi_aff(isl::multi_aff ma); + inline /* implicit */ 
union_pw_multi_aff(isl::pw_multi_aff pma); + inline explicit union_pw_multi_aff(isl::union_map umap); + inline /* implicit */ union_pw_multi_aff(isl::union_pw_aff upa); + inline explicit union_pw_multi_aff(isl::ctx ctx, const std::string &str); + inline union_pw_multi_aff &operator=(union_pw_multi_aff obj); + inline ~union_pw_multi_aff(); + inline __isl_give isl_union_pw_multi_aff *copy() const &; + inline __isl_give isl_union_pw_multi_aff *copy() && = delete; + inline __isl_keep isl_union_pw_multi_aff *get() const; + inline __isl_give isl_union_pw_multi_aff *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_pw_multi_aff add(isl::union_pw_multi_aff upma2) const; + inline isl::union_pw_multi_aff add_pw_multi_aff(isl::pw_multi_aff pma) const; + inline isl::union_pw_multi_aff apply(isl::union_pw_multi_aff upma2) const; + inline isl::multi_union_pw_aff as_multi_union_pw_aff() const; + inline isl::pw_multi_aff as_pw_multi_aff() const; + inline isl::union_map as_union_map() const; + inline isl::union_pw_multi_aff coalesce() const; + inline isl::union_set domain() const; + static inline isl::union_pw_multi_aff empty(isl::space space); + static inline isl::union_pw_multi_aff empty(isl::ctx ctx); + inline isl::pw_multi_aff extract_pw_multi_aff(isl::space space) const; + inline isl::union_pw_multi_aff flat_range_product(isl::union_pw_multi_aff upma2) const; + inline isl::union_pw_multi_aff gist(isl::union_set context) const; + inline isl::union_pw_multi_aff intersect_domain(isl::space space) const; + inline isl::union_pw_multi_aff intersect_domain(isl::union_set uset) const; + inline isl::union_pw_multi_aff intersect_domain_wrapped_domain(isl::union_set uset) const; + inline isl::union_pw_multi_aff intersect_domain_wrapped_range(isl::union_set uset) const; + inline isl::union_pw_multi_aff intersect_params(isl::set set) const; + inline boolean involves_locals() const; + inline boolean isa_pw_multi_aff() const; + inline boolean 
plain_is_empty() const; + inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(isl::union_pw_multi_aff upma2) const; + inline isl::union_pw_multi_aff pullback(isl::union_pw_multi_aff upma2) const; + inline isl::pw_multi_aff_list pw_multi_aff_list() const; + inline isl::pw_multi_aff_list get_pw_multi_aff_list() const; + inline isl::union_pw_multi_aff range_factor_domain() const; + inline isl::union_pw_multi_aff range_factor_range() const; + inline isl::union_pw_multi_aff range_product(isl::union_pw_multi_aff upma2) const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::union_pw_multi_aff sub(isl::union_pw_multi_aff upma2) const; + inline isl::union_pw_multi_aff subtract_domain(isl::space space) const; + inline isl::union_pw_multi_aff subtract_domain(isl::union_set uset) const; + inline isl::union_pw_multi_aff union_add(isl::union_pw_multi_aff upma2) const; +}; + +// declarations for isl::union_set +inline union_set manage(__isl_take isl_union_set *ptr); +inline union_set manage_copy(__isl_keep isl_union_set *ptr); + +class union_set { + friend inline union_set manage(__isl_take isl_union_set *ptr); + friend inline union_set manage_copy(__isl_keep isl_union_set *ptr); + +protected: + isl_union_set *ptr = nullptr; + + inline explicit union_set(__isl_take isl_union_set *ptr); + +public: + inline /* implicit */ union_set(); + inline /* implicit */ union_set(const union_set &obj); + inline /* implicit */ union_set(isl::basic_set bset); + inline /* implicit */ union_set(isl::point pnt); + inline /* implicit */ union_set(isl::set set); + inline explicit union_set(isl::ctx ctx, const std::string &str); + inline union_set &operator=(union_set obj); + inline ~union_set(); + inline __isl_give isl_union_set *copy() const &; + inline __isl_give isl_union_set *copy() && = delete; + inline __isl_keep isl_union_set *get() const; + inline __isl_give isl_union_set *release(); + inline bool is_null() const; + inline isl::ctx ctx() 
const; + + inline isl::union_set affine_hull() const; + inline isl::union_set apply(isl::union_map umap) const; + inline isl::set as_set() const; + inline isl::union_set coalesce() const; + inline isl::union_set compute_divs() const; + inline boolean contains(const isl::space &space) const; + inline isl::union_set detect_equalities() const; + static inline isl::union_set empty(isl::ctx ctx); + inline boolean every_set(const std::function &test) const; + inline isl::set extract_set(isl::space space) const; + inline stat foreach_point(const std::function &fn) const; + inline stat foreach_set(const std::function &fn) const; + inline isl::union_set gist(isl::union_set context) const; + inline isl::union_set gist_params(isl::set set) const; + inline isl::union_map identity() const; + inline isl::union_pw_multi_aff identity_union_pw_multi_aff() const; + inline isl::union_set intersect(isl::union_set uset2) const; + inline isl::union_set intersect_params(isl::set set) const; + inline boolean is_disjoint(const isl::union_set &uset2) const; + inline boolean is_empty() const; + inline boolean is_equal(const isl::union_set &uset2) const; + inline boolean is_strict_subset(const isl::union_set &uset2) const; + inline boolean is_subset(const isl::union_set &uset2) const; + inline boolean isa_set() const; + inline isl::union_set lexmax() const; + inline isl::union_set lexmin() const; + inline isl::set params() const; + inline isl::union_set polyhedral_hull() const; + inline isl::union_set preimage(isl::multi_aff ma) const; + inline isl::union_set preimage(isl::pw_multi_aff pma) const; + inline isl::union_set preimage(isl::union_pw_multi_aff upma) const; + inline isl::point sample_point() const; + inline isl::set_list set_list() const; + inline isl::set_list get_set_list() const; + inline isl::space space() const; + inline isl::space get_space() const; + inline isl::union_set subtract(isl::union_set uset2) const; + inline isl::union_set_list to_list() const; + inline 
isl::union_set unite(isl::union_set uset2) const; + inline isl::union_set universe() const; + inline isl::union_map unwrap() const; +}; + +// declarations for isl::union_set_list +inline union_set_list manage(__isl_take isl_union_set_list *ptr); +inline union_set_list manage_copy(__isl_keep isl_union_set_list *ptr); + +class union_set_list { + friend inline union_set_list manage(__isl_take isl_union_set_list *ptr); + friend inline union_set_list manage_copy(__isl_keep isl_union_set_list *ptr); + +protected: + isl_union_set_list *ptr = nullptr; + + inline explicit union_set_list(__isl_take isl_union_set_list *ptr); + +public: + inline /* implicit */ union_set_list(); + inline /* implicit */ union_set_list(const union_set_list &obj); + inline explicit union_set_list(isl::ctx ctx, int n); + inline explicit union_set_list(isl::union_set el); + inline explicit union_set_list(isl::ctx ctx, const std::string &str); + inline union_set_list &operator=(union_set_list obj); + inline ~union_set_list(); + inline __isl_give isl_union_set_list *copy() const &; + inline __isl_give isl_union_set_list *copy() && = delete; + inline __isl_keep isl_union_set_list *get() const; + inline __isl_give isl_union_set_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::union_set_list add(isl::union_set el) const; + inline isl::union_set at(int index) const; + inline isl::union_set get_at(int index) const; + inline isl::union_set_list clear() const; + inline isl::union_set_list concat(isl::union_set_list list2) const; + inline isl::union_set_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::union_set_list insert(unsigned int pos, isl::union_set el) const; + inline class size size() const; +}; + +// declarations for isl::val +inline val manage(__isl_take isl_val *ptr); +inline val manage_copy(__isl_keep isl_val *ptr); + +class val { + friend inline val manage(__isl_take isl_val 
*ptr); + friend inline val manage_copy(__isl_keep isl_val *ptr); + +protected: + isl_val *ptr = nullptr; + + inline explicit val(__isl_take isl_val *ptr); + +public: + inline /* implicit */ val(); + inline /* implicit */ val(const val &obj); + inline explicit val(isl::ctx ctx, long i); + inline explicit val(isl::ctx ctx, const std::string &str); + inline val &operator=(val obj); + inline ~val(); + inline __isl_give isl_val *copy() const &; + inline __isl_give isl_val *copy() && = delete; + inline __isl_keep isl_val *get() const; + inline __isl_give isl_val *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::val abs() const; + inline boolean abs_eq(const isl::val &v2) const; + inline boolean abs_eq(long v2) const; + inline isl::val add(isl::val v2) const; + inline isl::val add(long v2) const; + inline isl::val ceil() const; + inline int cmp_si(long i) const; + inline long den_si() const; + inline long get_den_si() const; + inline isl::val div(isl::val v2) const; + inline isl::val div(long v2) const; + inline boolean eq(const isl::val &v2) const; + inline boolean eq(long v2) const; + inline isl::val floor() const; + inline isl::val gcd(isl::val v2) const; + inline isl::val gcd(long v2) const; + inline boolean ge(const isl::val &v2) const; + inline boolean ge(long v2) const; + inline boolean gt(const isl::val &v2) const; + inline boolean gt(long v2) const; + static inline isl::val infty(isl::ctx ctx); + static inline isl::val int_from_ui(isl::ctx ctx, unsigned long u); + inline isl::val inv() const; + inline boolean is_divisible_by(const isl::val &v2) const; + inline boolean is_divisible_by(long v2) const; + inline boolean is_infty() const; + inline boolean is_int() const; + inline boolean is_nan() const; + inline boolean is_neg() const; + inline boolean is_neginfty() const; + inline boolean is_negone() const; + inline boolean is_nonneg() const; + inline boolean is_nonpos() const; + inline boolean is_one() const; + inline boolean 
is_pos() const; + inline boolean is_rat() const; + inline boolean is_zero() const; + inline boolean le(const isl::val &v2) const; + inline boolean le(long v2) const; + inline boolean lt(const isl::val &v2) const; + inline boolean lt(long v2) const; + inline isl::val max(isl::val v2) const; + inline isl::val max(long v2) const; + inline isl::val min(isl::val v2) const; + inline isl::val min(long v2) const; + inline isl::val mod(isl::val v2) const; + inline isl::val mod(long v2) const; + inline isl::val mul(isl::val v2) const; + inline isl::val mul(long v2) const; + static inline isl::val nan(isl::ctx ctx); + inline boolean ne(const isl::val &v2) const; + inline boolean ne(long v2) const; + inline isl::val neg() const; + static inline isl::val neginfty(isl::ctx ctx); + static inline isl::val negone(isl::ctx ctx); + inline long num_si() const; + inline long get_num_si() const; + static inline isl::val one(isl::ctx ctx); + inline isl::val pow2() const; + inline int sgn() const; + inline isl::val sub(isl::val v2) const; + inline isl::val sub(long v2) const; + inline isl::val sub_ui(unsigned long v2) const; + inline isl::val_list to_list() const; + inline isl::val trunc() const; + static inline isl::val zero(isl::ctx ctx); +}; + +// declarations for isl::val_list +inline val_list manage(__isl_take isl_val_list *ptr); +inline val_list manage_copy(__isl_keep isl_val_list *ptr); + +class val_list { + friend inline val_list manage(__isl_take isl_val_list *ptr); + friend inline val_list manage_copy(__isl_keep isl_val_list *ptr); + +protected: + isl_val_list *ptr = nullptr; + + inline explicit val_list(__isl_take isl_val_list *ptr); + +public: + inline /* implicit */ val_list(); + inline /* implicit */ val_list(const val_list &obj); + inline explicit val_list(isl::ctx ctx, int n); + inline explicit val_list(isl::val el); + inline explicit val_list(isl::ctx ctx, const std::string &str); + inline val_list &operator=(val_list obj); + inline ~val_list(); + inline __isl_give 
isl_val_list *copy() const &; + inline __isl_give isl_val_list *copy() && = delete; + inline __isl_keep isl_val_list *get() const; + inline __isl_give isl_val_list *release(); + inline bool is_null() const; + inline isl::ctx ctx() const; + + inline isl::val_list add(isl::val el) const; + inline isl::val_list add(long el) const; + inline isl::val at(int index) const; + inline isl::val get_at(int index) const; + inline isl::val_list clear() const; + inline isl::val_list concat(isl::val_list list2) const; + inline isl::val_list drop(unsigned int first, unsigned int n) const; + inline stat foreach(const std::function &fn) const; + inline isl::val_list insert(unsigned int pos, isl::val el) const; + inline isl::val_list insert(unsigned int pos, long el) const; + inline class size size() const; +}; + +// implementations for isl::aff +aff manage(__isl_take isl_aff *ptr) { + return aff(ptr); +} +aff manage_copy(__isl_keep isl_aff *ptr) { + ptr = isl_aff_copy(ptr); + return aff(ptr); +} + +aff::aff() + : ptr(nullptr) {} + +aff::aff(const aff &obj) + : ptr(nullptr) +{ + ptr = obj.copy(); +} + +aff::aff(__isl_take isl_aff *ptr) + : ptr(ptr) {} + +aff::aff(isl::ctx ctx, const std::string &str) +{ + auto res = isl_aff_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} + +aff::aff(isl::local_space ls, isl::val val) +{ + auto res = isl_aff_val_on_domain(ls.release(), val.release()); + ptr = res; +} + +aff::aff(isl::local_space ls) +{ + auto res = isl_aff_zero_on_domain(ls.release()); + ptr = res; +} + +aff &aff::operator=(aff obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +aff::~aff() { + if (ptr) + isl_aff_free(ptr); +} + +__isl_give isl_aff *aff::copy() const & { + return isl_aff_copy(ptr); +} + +__isl_keep isl_aff *aff::get() const { + return ptr; +} + +__isl_give isl_aff *aff::release() { + isl_aff *tmp = ptr; + ptr = nullptr; + return tmp; +} + +bool aff::is_null() const { + return ptr == nullptr; +} + +isl::ctx aff::ctx() const { + return 
isl::ctx(isl_aff_get_ctx(ptr)); +} + +isl::aff aff::add(isl::aff aff2) const +{ + auto res = isl_aff_add(copy(), aff2.release()); + return manage(res); +} + +isl::multi_aff aff::add(const isl::multi_aff &multi2) const +{ + return isl::multi_aff(*this).add(multi2); +} + +isl::multi_pw_aff aff::add(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).add(multi2); +} + +isl::multi_union_pw_aff aff::add(const isl::multi_union_pw_aff &multi2) const +{ + return isl::pw_aff(*this).add(multi2); +} + +isl::pw_aff aff::add(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).add(pwaff2); +} + +isl::pw_multi_aff aff::add(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).add(pma2); +} + +isl::union_pw_aff aff::add(const isl::union_pw_aff &upa2) const +{ + return isl::pw_aff(*this).add(upa2); +} + +isl::union_pw_multi_aff aff::add(const isl::union_pw_multi_aff &upma2) const +{ + return isl::pw_aff(*this).add(upma2); +} + +isl::aff aff::add_constant(isl::val v) const +{ + auto res = isl_aff_add_constant_val(copy(), v.release()); + return manage(res); +} + +isl::aff aff::add_constant(long v) const +{ + return this->add_constant(isl::val(ctx(), v)); +} + +isl::multi_aff aff::add_constant(const isl::multi_val &mv) const +{ + return isl::multi_aff(*this).add_constant(mv); +} + +isl::aff aff::add_constant_si(int v) const +{ + auto res = isl_aff_add_constant_si(copy(), v); + return manage(res); +} + +isl::pw_aff aff::add_dims(isl::dim type, unsigned int n) const +{ + return isl::pw_aff(*this).add_dims(type, n); +} + +isl::union_pw_multi_aff aff::add_pw_multi_aff(const isl::pw_multi_aff &pma) const +{ + return isl::pw_aff(*this).add_pw_multi_aff(pma); +} + +isl::union_pw_multi_aff aff::apply(const isl::union_pw_multi_aff &upma2) const +{ + return isl::pw_aff(*this).apply(upma2); +} + +isl::aff aff::as_aff() const +{ + return isl::pw_aff(*this).as_aff(); +} + +isl::map aff::as_map() const +{ + return isl::pw_aff(*this).as_map(); +} + 
+isl::multi_aff aff::as_multi_aff() const +{ + return isl::pw_aff(*this).as_multi_aff(); +} + +isl::multi_union_pw_aff aff::as_multi_union_pw_aff() const +{ + return isl::pw_aff(*this).as_multi_union_pw_aff(); +} + +isl::pw_multi_aff aff::as_pw_multi_aff() const +{ + return isl::pw_aff(*this).as_pw_multi_aff(); +} + +isl::set aff::as_set() const +{ + return isl::multi_aff(*this).as_set(); +} + +isl::union_map aff::as_union_map() const +{ + return isl::pw_aff(*this).as_union_map(); +} + +isl::aff aff::at(int pos) const +{ + return isl::multi_aff(*this).at(pos); +} + +isl::basic_set aff::bind(isl::id id) const +{ + auto res = isl_aff_bind_id(copy(), id.release()); + return manage(res); +} + +isl::basic_set aff::bind(const std::string &id) const +{ + return this->bind(isl::id(ctx(), id)); +} + +isl::basic_set aff::bind(const isl::multi_id &tuple) const +{ + return isl::multi_aff(*this).bind(tuple); +} + +isl::pw_aff aff::bind_domain(const isl::multi_id &tuple) const +{ + return isl::pw_aff(*this).bind_domain(tuple); +} + +isl::pw_aff aff::bind_domain_wrapped_domain(const isl::multi_id &tuple) const +{ + return isl::pw_aff(*this).bind_domain_wrapped_domain(tuple); +} + +isl::aff aff::ceil() const +{ + auto res = isl_aff_ceil(copy()); + return manage(res); +} + +isl::pw_aff aff::coalesce() const +{ + return isl::pw_aff(*this).coalesce(); +} + +isl::pw_aff aff::cond(const isl::pw_aff &pwaff_true, const isl::pw_aff &pwaff_false) const +{ + return isl::pw_aff(*this).cond(pwaff_true, pwaff_false); +} + +isl::multi_val aff::constant_multi_val() const +{ + return isl::multi_aff(*this).constant_multi_val(); +} + +isl::val aff::constant_val() const +{ + auto res = isl_aff_get_constant_val(get()); + return manage(res); +} + +isl::val aff::get_constant_val() const +{ + return constant_val(); +} + +isl::val aff::denominator_val() const +{ + auto res = isl_aff_get_denominator_val(get()); + return manage(res); +} + +isl::val aff::get_denominator_val() const +{ + return 
denominator_val(); +} + +class size aff::dim(isl::dim type) const +{ + return isl::multi_aff(*this).dim(type); +} + +isl::id aff::dim_id(isl::dim type, unsigned int pos) const +{ + return isl::pw_aff(*this).dim_id(type, pos); +} + +isl::aff aff::div(isl::aff aff2) const +{ + auto res = isl_aff_div(copy(), aff2.release()); + return manage(res); +} + +isl::pw_aff aff::div(const isl::pw_aff &pa2) const +{ + return isl::pw_aff(*this).div(pa2); +} + +isl::set aff::domain() const +{ + return isl::pw_aff(*this).domain(); +} + +isl::space aff::domain_space() const +{ + return isl::pw_aff(*this).domain_space(); +} + +isl::pw_multi_aff aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +{ + return isl::pw_aff(*this).drop_dims(type, first, n); +} + +isl::set aff::eq_set(isl::aff aff2) const +{ + auto res = isl_aff_eq_set(copy(), aff2.release()); + return manage(res); +} + +isl::set aff::eq_set(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).eq_set(pwaff2); +} + +isl::val aff::eval(isl::point pnt) const +{ + auto res = isl_aff_eval(copy(), pnt.release()); + return manage(res); +} + +isl::pw_multi_aff aff::extract_pw_multi_aff(const isl::space &space) const +{ + return isl::pw_aff(*this).extract_pw_multi_aff(space); +} + +isl::multi_aff aff::flat_range_product(const isl::multi_aff &multi2) const +{ + return isl::multi_aff(*this).flat_range_product(multi2); +} + +isl::multi_pw_aff aff::flat_range_product(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).flat_range_product(multi2); +} + +isl::multi_union_pw_aff aff::flat_range_product(const isl::multi_union_pw_aff &multi2) const +{ + return isl::pw_aff(*this).flat_range_product(multi2); +} + +isl::pw_multi_aff aff::flat_range_product(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).flat_range_product(pma2); +} + +isl::union_pw_multi_aff aff::flat_range_product(const isl::union_pw_multi_aff &upma2) const +{ + return 
isl::pw_aff(*this).flat_range_product(upma2); +} + +isl::aff aff::floor() const +{ + auto res = isl_aff_floor(copy()); + return manage(res); +} + +stat aff::foreach_piece(const std::function &fn) const +{ + return isl::pw_aff(*this).foreach_piece(fn); +} + +stat aff::foreach_piece(const std::function &fn) const +{ + return isl::pw_aff(*this).foreach_piece(fn); +} + +stat aff::foreach_pw_aff(const std::function &fn) const +{ + return isl::pw_aff(*this).foreach_pw_aff(fn); +} + +isl::set aff::ge_set(isl::aff aff2) const +{ + auto res = isl_aff_ge_set(copy(), aff2.release()); + return manage(res); +} + +isl::set aff::ge_set(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).ge_set(pwaff2); +} + +isl::aff aff::gist(isl::set context) const +{ + auto res = isl_aff_gist(copy(), context.release()); + return manage(res); +} + +isl::union_pw_aff aff::gist(const isl::union_set &context) const +{ + return isl::pw_aff(*this).gist(context); +} + +isl::aff aff::gist(const isl::basic_set &context) const +{ + return this->gist(isl::set(context)); +} + +isl::aff aff::gist(const isl::point &context) const +{ + return this->gist(isl::set(context)); +} + +isl::set aff::gt_set(isl::aff aff2) const +{ + auto res = isl_aff_gt_set(copy(), aff2.release()); + return manage(res); +} + +isl::set aff::gt_set(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).gt_set(pwaff2); +} + +boolean aff::has_range_tuple_id() const +{ + return isl::multi_aff(*this).has_range_tuple_id(); +} + +isl::multi_aff aff::identity() const +{ + return isl::multi_aff(*this).identity(); +} + +isl::pw_aff aff::insert_domain(const isl::space &domain) const +{ + return isl::pw_aff(*this).insert_domain(domain); +} + +isl::pw_aff aff::intersect_domain(const isl::set &set) const +{ + return isl::pw_aff(*this).intersect_domain(set); +} + +isl::union_pw_aff aff::intersect_domain(const isl::space &space) const +{ + return isl::pw_aff(*this).intersect_domain(space); +} + +isl::union_pw_aff 
aff::intersect_domain(const isl::union_set &uset) const +{ + return isl::pw_aff(*this).intersect_domain(uset); +} + +isl::union_pw_aff aff::intersect_domain_wrapped_domain(const isl::union_set &uset) const +{ + return isl::pw_aff(*this).intersect_domain_wrapped_domain(uset); +} + +isl::union_pw_aff aff::intersect_domain_wrapped_range(const isl::union_set &uset) const +{ + return isl::pw_aff(*this).intersect_domain_wrapped_range(uset); +} + +isl::pw_aff aff::intersect_params(const isl::set &set) const +{ + return isl::pw_aff(*this).intersect_params(set); +} + +boolean aff::involves_locals() const +{ + return isl::multi_aff(*this).involves_locals(); +} + +boolean aff::involves_nan() const +{ + return isl::multi_aff(*this).involves_nan(); +} + +boolean aff::involves_param(const isl::id &id) const +{ + return isl::pw_aff(*this).involves_param(id); +} + +boolean aff::involves_param(const std::string &id) const +{ + return this->involves_param(isl::id(ctx(), id)); +} + +boolean aff::involves_param(const isl::id_list &list) const +{ + return isl::pw_aff(*this).involves_param(list); +} + +boolean aff::is_cst() const +{ + auto res = isl_aff_is_cst(get()); + return manage(res); +} + +boolean aff::is_equal(const isl::pw_aff &pa2) const +{ + return isl::pw_aff(*this).is_equal(pa2); +} + +boolean aff::isa_aff() const +{ + return isl::pw_aff(*this).isa_aff(); +} + +boolean aff::isa_multi_aff() const +{ + return isl::pw_aff(*this).isa_multi_aff(); +} + +boolean aff::isa_pw_multi_aff() const +{ + return isl::pw_aff(*this).isa_pw_multi_aff(); +} + +isl::set aff::le_set(isl::aff aff2) const +{ + auto res = isl_aff_le_set(copy(), aff2.release()); + return manage(res); +} + +isl::set aff::le_set(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).le_set(pwaff2); +} + +isl::aff_list aff::list() const +{ + return isl::multi_aff(*this).list(); +} + +isl::set aff::lt_set(isl::aff aff2) const +{ + auto res = isl_aff_lt_set(copy(), aff2.release()); + return manage(res); +} + 
+isl::set aff::lt_set(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).lt_set(pwaff2); +} + +isl::multi_pw_aff aff::max(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).max(multi2); +} + +isl::pw_aff aff::max(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).max(pwaff2); +} + +isl::multi_val aff::max_multi_val() const +{ + return isl::pw_aff(*this).max_multi_val(); +} + +isl::multi_pw_aff aff::min(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).min(multi2); +} + +isl::pw_aff aff::min(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).min(pwaff2); +} + +isl::multi_val aff::min_multi_val() const +{ + return isl::pw_aff(*this).min_multi_val(); +} + +isl::aff aff::mod(isl::val mod) const +{ + auto res = isl_aff_mod_val(copy(), mod.release()); + return manage(res); +} + +isl::aff aff::mod(long mod) const +{ + return this->mod(isl::val(ctx(), mod)); +} + +isl::aff aff::mul(isl::aff aff2) const +{ + auto res = isl_aff_mul(copy(), aff2.release()); + return manage(res); +} + +isl::pw_aff aff::mul(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).mul(pwaff2); +} + +class size aff::n_piece() const +{ + return isl::pw_aff(*this).n_piece(); +} + +isl::set aff::ne_set(isl::aff aff2) const +{ + auto res = isl_aff_ne_set(copy(), aff2.release()); + return manage(res); +} + +isl::set aff::ne_set(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).ne_set(pwaff2); +} + +isl::aff aff::neg() const +{ + auto res = isl_aff_neg(copy()); + return manage(res); +} + +boolean aff::plain_is_empty() const +{ + return isl::pw_aff(*this).plain_is_empty(); +} + +boolean aff::plain_is_equal(const isl::multi_aff &multi2) const +{ + return isl::multi_aff(*this).plain_is_equal(multi2); +} + +boolean aff::plain_is_equal(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).plain_is_equal(multi2); +} + +boolean aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const +{ + 
return isl::pw_aff(*this).plain_is_equal(multi2); +} + +isl::pw_multi_aff aff::preimage_domain_wrapped_domain(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).preimage_domain_wrapped_domain(pma2); +} + +isl::union_pw_multi_aff aff::preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const +{ + return isl::pw_aff(*this).preimage_domain_wrapped_domain(upma2); +} + +isl::multi_aff aff::product(const isl::multi_aff &multi2) const +{ + return isl::multi_aff(*this).product(multi2); +} + +isl::multi_pw_aff aff::product(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).product(multi2); +} + +isl::pw_multi_aff aff::product(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).product(pma2); +} + +isl::aff aff::pullback(isl::multi_aff ma) const +{ + auto res = isl_aff_pullback_multi_aff(copy(), ma.release()); + return manage(res); +} + +isl::pw_aff aff::pullback(const isl::multi_pw_aff &mpa) const +{ + return isl::pw_aff(*this).pullback(mpa); +} + +isl::pw_aff aff::pullback(const isl::pw_multi_aff &pma) const +{ + return isl::pw_aff(*this).pullback(pma); +} + +isl::union_pw_aff aff::pullback(const isl::union_pw_multi_aff &upma) const +{ + return isl::pw_aff(*this).pullback(upma); +} + +isl::aff aff::pullback(const isl::aff &ma) const +{ + return this->pullback(isl::multi_aff(ma)); +} + +isl::pw_multi_aff_list aff::pw_multi_aff_list() const +{ + return isl::pw_aff(*this).pw_multi_aff_list(); +} + +isl::pw_multi_aff aff::range_factor_domain() const +{ + return isl::pw_aff(*this).range_factor_domain(); +} + +isl::pw_multi_aff aff::range_factor_range() const +{ + return isl::pw_aff(*this).range_factor_range(); +} + +isl::multi_aff aff::range_product(const isl::multi_aff &multi2) const +{ + return isl::multi_aff(*this).range_product(multi2); +} + +isl::multi_pw_aff aff::range_product(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).range_product(multi2); +} + +isl::multi_union_pw_aff 
aff::range_product(const isl::multi_union_pw_aff &multi2) const +{ + return isl::pw_aff(*this).range_product(multi2); +} + +isl::pw_multi_aff aff::range_product(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).range_product(pma2); +} + +isl::union_pw_multi_aff aff::range_product(const isl::union_pw_multi_aff &upma2) const +{ + return isl::pw_aff(*this).range_product(upma2); +} + +isl::id aff::range_tuple_id() const +{ + return isl::multi_aff(*this).range_tuple_id(); +} + +isl::multi_aff aff::reset_range_tuple_id() const +{ + return isl::multi_aff(*this).reset_range_tuple_id(); +} + +isl::multi_aff aff::reset_tuple_id(isl::dim type) const +{ + return isl::multi_aff(*this).reset_tuple_id(type); +} + +isl::aff aff::scale(isl::val v) const +{ + auto res = isl_aff_scale_val(copy(), v.release()); + return manage(res); +} + +isl::aff aff::scale(long v) const +{ + return this->scale(isl::val(ctx(), v)); +} + +isl::multi_aff aff::scale(const isl::multi_val &mv) const +{ + return isl::multi_aff(*this).scale(mv); +} + +isl::aff aff::scale_down(isl::val v) const +{ + auto res = isl_aff_scale_down_val(copy(), v.release()); + return manage(res); +} + +isl::aff aff::scale_down(long v) const +{ + return this->scale_down(isl::val(ctx(), v)); +} + +isl::multi_aff aff::scale_down(const isl::multi_val &mv) const +{ + return isl::multi_aff(*this).scale_down(mv); +} + +isl::multi_aff aff::set_aff(int pos, const isl::aff &el) const +{ + return isl::multi_aff(*this).set_aff(pos, el); +} + +isl::multi_aff aff::set_at(int pos, const isl::aff &el) const +{ + return isl::multi_aff(*this).set_at(pos, el); +} + +isl::multi_pw_aff aff::set_at(int pos, const isl::pw_aff &el) const +{ + return isl::pw_aff(*this).set_at(pos, el); +} + +isl::multi_union_pw_aff aff::set_at(int pos, const isl::union_pw_aff &el) const +{ + return isl::pw_aff(*this).set_at(pos, el); +} + +isl::aff aff::set_constant_si(int v) const +{ + auto res = isl_aff_set_constant_si(copy(), v); + return 
manage(res); +} + +isl::multi_pw_aff aff::set_pw_aff(int pos, const isl::pw_aff &el) const +{ + return isl::pw_aff(*this).set_pw_aff(pos, el); +} + +isl::pw_multi_aff aff::set_pw_aff(unsigned int pos, const isl::pw_aff &pa) const +{ + return isl::pw_aff(*this).set_pw_aff(pos, pa); +} + +isl::multi_aff aff::set_range_tuple(const isl::id &id) const +{ + return isl::multi_aff(*this).set_range_tuple(id); +} + +isl::multi_aff aff::set_range_tuple(const std::string &id) const +{ + return this->set_range_tuple(isl::id(ctx(), id)); +} + +isl::pw_aff aff::set_tuple_id(isl::dim type, const isl::id &id) const +{ + return isl::pw_aff(*this).set_tuple_id(type, id); +} + +isl::pw_aff aff::set_tuple_id(isl::dim type, const std::string &id) const +{ + return this->set_tuple_id(type, isl::id(ctx(), id)); +} + +isl::multi_union_pw_aff aff::set_union_pw_aff(int pos, const isl::union_pw_aff &el) const +{ + return isl::pw_aff(*this).set_union_pw_aff(pos, el); +} + +class size aff::size() const +{ + return isl::multi_aff(*this).size(); +} + +isl::space aff::space() const +{ + return isl::pw_aff(*this).space(); +} + +isl::aff aff::sub(isl::aff aff2) const +{ + auto res = isl_aff_sub(copy(), aff2.release()); + return manage(res); +} + +isl::multi_aff aff::sub(const isl::multi_aff &multi2) const +{ + return isl::multi_aff(*this).sub(multi2); +} + +isl::multi_pw_aff aff::sub(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_aff(*this).sub(multi2); +} + +isl::multi_union_pw_aff aff::sub(const isl::multi_union_pw_aff &multi2) const +{ + return isl::pw_aff(*this).sub(multi2); +} + +isl::pw_aff aff::sub(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).sub(pwaff2); +} + +isl::pw_multi_aff aff::sub(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).sub(pma2); +} + +isl::union_pw_aff aff::sub(const isl::union_pw_aff &upa2) const +{ + return isl::pw_aff(*this).sub(upa2); +} + +isl::union_pw_multi_aff aff::sub(const isl::union_pw_multi_aff &upma2) const +{ + 
return isl::pw_aff(*this).sub(upma2); +} + +isl::pw_aff aff::subtract_domain(const isl::set &set) const +{ + return isl::pw_aff(*this).subtract_domain(set); +} + +isl::union_pw_aff aff::subtract_domain(const isl::space &space) const +{ + return isl::pw_aff(*this).subtract_domain(space); +} + +isl::union_pw_aff aff::subtract_domain(const isl::union_set &uset) const +{ + return isl::pw_aff(*this).subtract_domain(uset); +} + +isl::pw_aff aff::tdiv_q(const isl::pw_aff &pa2) const +{ + return isl::pw_aff(*this).tdiv_q(pa2); +} + +isl::pw_aff aff::tdiv_r(const isl::pw_aff &pa2) const +{ + return isl::pw_aff(*this).tdiv_r(pa2); +} + +isl::aff_list aff::to_list() const +{ + auto res = isl_aff_to_list(copy()); + return manage(res); +} + +isl::multi_pw_aff aff::to_multi_pw_aff() const +{ + return isl::multi_aff(*this).to_multi_pw_aff(); +} + +isl::multi_union_pw_aff aff::to_multi_union_pw_aff() const +{ + return isl::multi_aff(*this).to_multi_union_pw_aff(); +} + +isl::pw_multi_aff aff::to_pw_multi_aff() const +{ + return isl::multi_aff(*this).to_pw_multi_aff(); +} + +isl::union_pw_aff aff::to_union_pw_aff() const +{ + return isl::pw_aff(*this).to_union_pw_aff(); +} + +isl::union_pw_multi_aff aff::to_union_pw_multi_aff() const +{ + return isl::pw_aff(*this).to_union_pw_multi_aff(); +} + +isl::id aff::tuple_id(isl::dim type) const +{ + return isl::pw_aff(*this).tuple_id(type); +} + +isl::aff aff::unbind_params_insert_domain(isl::multi_id domain) const +{ + auto res = isl_aff_unbind_params_insert_domain(copy(), domain.release()); + return manage(res); +} + +isl::multi_pw_aff aff::union_add(const isl::multi_pw_aff &mpa2) const +{ + return isl::pw_aff(*this).union_add(mpa2); +} + +isl::multi_union_pw_aff aff::union_add(const isl::multi_union_pw_aff &mupa2) const +{ + return isl::pw_aff(*this).union_add(mupa2); +} + +isl::pw_aff aff::union_add(const isl::pw_aff &pwaff2) const +{ + return isl::pw_aff(*this).union_add(pwaff2); +} + +isl::pw_multi_aff aff::union_add(const 
isl::pw_multi_aff &pma2) const +{ + return isl::pw_aff(*this).union_add(pma2); +} + +isl::union_pw_aff aff::union_add(const isl::union_pw_aff &upa2) const +{ + return isl::pw_aff(*this).union_add(upa2); +} + +isl::union_pw_multi_aff aff::union_add(const isl::union_pw_multi_aff &upma2) const +{ + return isl::pw_aff(*this).union_add(upma2); +} + +isl::aff aff::var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos) +{ + auto res = isl_aff_var_on_domain(ls.release(), static_cast(type), pos); + return manage(res); +} + +isl::aff aff::zero_on_domain(isl::space space) +{ + auto res = isl_aff_zero_on_domain_space(space.release()); + return manage(res); +} + +inline std::ostream &operator<<(std::ostream &os, const aff &obj) +{ + char *str = isl_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + +// implementations for isl::aff_list +aff_list manage(__isl_take isl_aff_list *ptr) { + return aff_list(ptr); +} +aff_list manage_copy(__isl_keep isl_aff_list *ptr) { + ptr = isl_aff_list_copy(ptr); + return aff_list(ptr); +} + +aff_list::aff_list() + : ptr(nullptr) {} + +aff_list::aff_list(const aff_list &obj) + : ptr(nullptr) +{ + ptr = obj.copy(); +} + +aff_list::aff_list(__isl_take isl_aff_list *ptr) + : ptr(ptr) {} + +aff_list::aff_list(isl::ctx ctx, int n) +{ + auto res = isl_aff_list_alloc(ctx.release(), n); + ptr = res; +} + +aff_list::aff_list(isl::aff el) +{ + auto res = isl_aff_list_from_aff(el.release()); + ptr = res; +} + +aff_list::aff_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_aff_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} + +aff_list &aff_list::operator=(aff_list obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +aff_list::~aff_list() { + if (ptr) + isl_aff_list_free(ptr); +} + +__isl_give isl_aff_list *aff_list::copy() const & { + return isl_aff_list_copy(ptr); +} + +__isl_keep isl_aff_list *aff_list::get() const { + 
return ptr; +} + +__isl_give isl_aff_list *aff_list::release() { + isl_aff_list *tmp = ptr; + ptr = nullptr; + return tmp; +} + +bool aff_list::is_null() const { + return ptr == nullptr; +} + +isl::ctx aff_list::ctx() const { + return isl::ctx(isl_aff_list_get_ctx(ptr)); +} + +isl::aff_list aff_list::add(isl::aff el) const +{ + auto res = isl_aff_list_add(copy(), el.release()); + return manage(res); +} + +isl::aff aff_list::at(int index) const +{ + auto res = isl_aff_list_get_at(get(), index); + return manage(res); +} + +isl::aff aff_list::get_at(int index) const +{ + return at(index); +} + +isl::aff_list aff_list::clear() const +{ + auto res = isl_aff_list_clear(copy()); + return manage(res); +} + +isl::aff_list aff_list::concat(isl::aff_list list2) const +{ + auto res = isl_aff_list_concat(copy(), list2.release()); + return manage(res); +} + +isl::aff_list aff_list::drop(unsigned int first, unsigned int n) const +{ + auto res = isl_aff_list_drop(copy(), first, n); + return manage(res); +} + +stat aff_list::foreach(const std::function &fn) const +{ + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_aff *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_aff_list_foreach(get(), fn_lambda, &fn_data); + return manage(res); +} -class term { - friend inline term manage(__isl_take isl_term *ptr); - friend inline term manage_copy(__isl_keep isl_term *ptr); +isl::aff_list aff_list::insert(unsigned int pos, isl::aff el) const +{ + auto res = isl_aff_list_insert(copy(), pos, el.release()); + return manage(res); +} - isl_term *ptr = nullptr; +class size aff_list::size() const +{ + auto res = isl_aff_list_size(get()); + return manage(res); +} - inline explicit term(__isl_take isl_term *ptr); +inline std::ostream &operator<<(std::ostream &os, const aff_list &obj) +{ + char *str = isl_aff_list_to_str(obj.get()); + if (!str) { + 
os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} -public: - inline /* implicit */ term(); - inline /* implicit */ term(const term &obj); - inline term &operator=(term obj); - inline ~term(); - inline __isl_give isl_term *copy() const &; - inline __isl_give isl_term *copy() && = delete; - inline __isl_keep isl_term *get() const; - inline __isl_give isl_term *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; +// implementations for isl::ast_build +ast_build manage(__isl_take isl_ast_build *ptr) { + return ast_build(ptr); +} +ast_build manage_copy(__isl_keep isl_ast_build *ptr) { + ptr = isl_ast_build_copy(ptr); + return ast_build(ptr); +} - inline isl_size dim(isl::dim type) const; - inline isl::val get_coefficient_val() const; - inline isl::aff get_div(unsigned int pos) const; - inline isl_size get_exp(isl::dim type, unsigned int pos) const; -}; +ast_build::ast_build() + : ptr(nullptr) {} -// declarations for isl::union_access_info -inline union_access_info manage(__isl_take isl_union_access_info *ptr); -inline union_access_info manage_copy(__isl_keep isl_union_access_info *ptr); +ast_build::ast_build(const ast_build &obj) + : ptr(nullptr) +{ + ptr = obj.copy(); + copy_callbacks(obj); +} -class union_access_info { - friend inline union_access_info manage(__isl_take isl_union_access_info *ptr); - friend inline union_access_info manage_copy(__isl_keep isl_union_access_info *ptr); +ast_build::ast_build(__isl_take isl_ast_build *ptr) + : ptr(ptr) {} - isl_union_access_info *ptr = nullptr; +ast_build::ast_build(isl::ctx ctx) +{ + auto res = isl_ast_build_alloc(ctx.release()); + ptr = res; +} - inline explicit union_access_info(__isl_take isl_union_access_info *ptr); +ast_build &ast_build::operator=(ast_build obj) { + std::swap(this->ptr, obj.ptr); + copy_callbacks(obj); + return *this; +} -public: - inline /* implicit */ union_access_info(); - inline /* implicit */ union_access_info(const 
union_access_info &obj); - inline explicit union_access_info(isl::union_map sink); - inline union_access_info &operator=(union_access_info obj); - inline ~union_access_info(); - inline __isl_give isl_union_access_info *copy() const &; - inline __isl_give isl_union_access_info *copy() && = delete; - inline __isl_keep isl_union_access_info *get() const; - inline __isl_give isl_union_access_info *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; +ast_build::~ast_build() { + if (ptr) + isl_ast_build_free(ptr); +} - inline isl::union_flow compute_flow() const; - inline isl::union_access_info set_kill(isl::union_map kill) const; - inline isl::union_access_info set_may_source(isl::union_map may_source) const; - inline isl::union_access_info set_must_source(isl::union_map must_source) const; - inline isl::union_access_info set_schedule(isl::schedule schedule) const; - inline isl::union_access_info set_schedule_map(isl::union_map schedule_map) const; -}; +__isl_give isl_ast_build *ast_build::copy() const & { + return isl_ast_build_copy(ptr); +} -// declarations for isl::union_flow -inline union_flow manage(__isl_take isl_union_flow *ptr); -inline union_flow manage_copy(__isl_keep isl_union_flow *ptr); +__isl_keep isl_ast_build *ast_build::get() const { + return ptr; +} -class union_flow { - friend inline union_flow manage(__isl_take isl_union_flow *ptr); - friend inline union_flow manage_copy(__isl_keep isl_union_flow *ptr); +__isl_give isl_ast_build *ast_build::release() { + if (at_each_domain_data) + isl_die(ctx().get(), isl_error_invalid, "cannot release object with persistent callbacks", return nullptr); + isl_ast_build *tmp = ptr; + ptr = nullptr; + return tmp; +} - isl_union_flow *ptr = nullptr; +bool ast_build::is_null() const { + return ptr == nullptr; +} - inline explicit union_flow(__isl_take isl_union_flow *ptr); +isl::ctx ast_build::ctx() const { + return isl::ctx(isl_ast_build_get_ctx(ptr)); +} -public: - inline /* implicit */ 
union_flow(); - inline /* implicit */ union_flow(const union_flow &obj); - inline union_flow &operator=(union_flow obj); - inline ~union_flow(); - inline __isl_give isl_union_flow *copy() const &; - inline __isl_give isl_union_flow *copy() && = delete; - inline __isl_keep isl_union_flow *get() const; - inline __isl_give isl_union_flow *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; +ast_build &ast_build::copy_callbacks(const ast_build &obj) +{ + at_each_domain_data = obj.at_each_domain_data; + return *this; +} - inline isl::union_map get_full_may_dependence() const; - inline isl::union_map get_full_must_dependence() const; - inline isl::union_map get_may_dependence() const; - inline isl::union_map get_may_no_source() const; - inline isl::union_map get_must_dependence() const; - inline isl::union_map get_must_no_source() const; -}; +isl_ast_node *ast_build::at_each_domain(isl_ast_node *arg_0, isl_ast_build *arg_1, void *arg_2) +{ + auto *data = static_cast(arg_2); + auto ret = (data->func)(manage(arg_0), manage_copy(arg_1)); + return ret.release(); +} -// declarations for isl::union_map -inline union_map manage(__isl_take isl_union_map *ptr); -inline union_map manage_copy(__isl_keep isl_union_map *ptr); +void ast_build::set_at_each_domain_data(const std::function &fn) +{ + at_each_domain_data = std::make_shared(); + at_each_domain_data->func = fn; + ptr = isl_ast_build_set_at_each_domain(ptr, &at_each_domain, at_each_domain_data.get()); +} -class union_map { - friend inline union_map manage(__isl_take isl_union_map *ptr); - friend inline union_map manage_copy(__isl_keep isl_union_map *ptr); +isl::ast_build ast_build::set_at_each_domain(const std::function &fn) const +{ + auto copy = *this; + copy.set_at_each_domain_data(fn); + return copy; +} - isl_union_map *ptr = nullptr; +isl::ast_expr ast_build::access_from(isl::multi_pw_aff mpa) const +{ + auto res = isl_ast_build_access_from_multi_pw_aff(get(), mpa.release()); + return manage(res); +} 
- inline explicit union_map(__isl_take isl_union_map *ptr); +isl::ast_expr ast_build::access_from(isl::pw_multi_aff pma) const +{ + auto res = isl_ast_build_access_from_pw_multi_aff(get(), pma.release()); + return manage(res); +} -public: - inline /* implicit */ union_map(); - inline /* implicit */ union_map(const union_map &obj); - inline /* implicit */ union_map(isl::basic_map bmap); - inline /* implicit */ union_map(isl::map map); - inline explicit union_map(isl::union_pw_multi_aff upma); - inline explicit union_map(isl::ctx ctx, const std::string &str); - inline union_map &operator=(union_map obj); - inline ~union_map(); - inline __isl_give isl_union_map *copy() const &; - inline __isl_give isl_union_map *copy() && = delete; - inline __isl_keep isl_union_map *get() const; - inline __isl_give isl_union_map *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; +isl::ast_expr ast_build::call_from(isl::multi_pw_aff mpa) const +{ + auto res = isl_ast_build_call_from_multi_pw_aff(get(), mpa.release()); + return manage(res); +} - inline isl::union_map affine_hull() const; - inline isl::union_map align_params(isl::space model) const; - inline isl::union_map apply_domain(isl::union_map umap2) const; - inline isl::union_map apply_range(isl::union_map umap2) const; - inline isl::union_set bind_range(isl::multi_id tuple) const; - inline isl::union_map coalesce() const; - inline boolean contains(const isl::space &space) const; - inline isl::union_map curry() const; - inline isl::union_set deltas() const; - inline isl::union_map deltas_map() const; - inline isl::union_map detect_equalities() const; - inline isl_size dim(isl::dim type) const; - inline isl::union_set domain() const; - inline isl::union_map domain_factor_domain() const; - inline isl::union_map domain_factor_range() const; - inline isl::union_map domain_map() const; - inline isl::union_pw_multi_aff domain_map_union_pw_multi_aff() const; - inline isl::union_map 
domain_product(isl::union_map umap2) const; - static inline isl::union_map empty(isl::ctx ctx); - inline isl::union_map eq_at(isl::multi_union_pw_aff mupa) const; - inline isl::map extract_map(isl::space space) const; - inline isl::union_map factor_domain() const; - inline isl::union_map factor_range() const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::union_map fixed_power(isl::val exp) const; - inline isl::union_map flat_domain_product(isl::union_map umap2) const; - inline isl::union_map flat_range_product(isl::union_map umap2) const; - inline stat foreach_map(const std::function &fn) const; - static inline isl::union_map from(isl::multi_union_pw_aff mupa); - static inline isl::union_map from_domain(isl::union_set uset); - static inline isl::union_map from_domain_and_range(isl::union_set domain, isl::union_set range); - static inline isl::union_map from_range(isl::union_set uset); - static inline isl::union_map from_union_pw_aff(isl::union_pw_aff upa); - inline isl::id get_dim_id(isl::dim type, unsigned int pos) const; - inline uint32_t get_hash() const; - inline isl::map_list get_map_list() const; - inline isl::space get_space() const; - inline isl::union_map gist(isl::union_map context) const; - inline isl::union_map gist_domain(isl::union_set uset) const; - inline isl::union_map gist_params(isl::set set) const; - inline isl::union_map gist_range(isl::union_set uset) const; - inline isl::union_map intersect(isl::union_map umap2) const; - inline isl::union_map intersect_domain(isl::space space) const; - inline isl::union_map intersect_domain(isl::union_set uset) const; - inline isl::union_map intersect_domain_factor_domain(isl::union_map factor) const; - inline isl::union_map intersect_domain_factor_range(isl::union_map factor) const; - inline isl::union_map intersect_params(isl::set set) const; - inline isl::union_map intersect_range(isl::space space) const; - inline isl::union_map intersect_range(isl::union_set 
uset) const; - inline isl::union_map intersect_range_factor_domain(isl::union_map factor) const; - inline isl::union_map intersect_range_factor_range(isl::union_map factor) const; - inline boolean involves_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline boolean is_bijective() const; - inline boolean is_disjoint(const isl::union_map &umap2) const; - inline boolean is_empty() const; - inline boolean is_equal(const isl::union_map &umap2) const; - inline boolean is_identity() const; - inline boolean is_injective() const; - inline boolean is_single_valued() const; - inline boolean is_strict_subset(const isl::union_map &umap2) const; - inline boolean is_subset(const isl::union_map &umap2) const; - inline boolean isa_map() const; - inline isl::union_map lex_ge_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const; - inline isl::union_map lex_ge_union_map(isl::union_map umap2) const; - inline isl::union_map lex_gt_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const; - inline isl::union_map lex_gt_union_map(isl::union_map umap2) const; - inline isl::union_map lex_le_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const; - inline isl::union_map lex_le_union_map(isl::union_map umap2) const; - inline isl::union_map lex_lt_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const; - inline isl::union_map lex_lt_union_map(isl::union_map umap2) const; - inline isl::union_map lexmax() const; - inline isl::union_map lexmin() const; - inline isl_size n_map() const; - inline isl::set params() const; - inline boolean plain_is_empty() const; - inline boolean plain_is_injective() const; - inline isl::union_map polyhedral_hull() const; - inline isl::union_map preimage_domain(isl::multi_aff ma) const; - inline isl::union_map preimage_domain(isl::multi_pw_aff mpa) const; - inline isl::union_map preimage_domain(isl::pw_multi_aff pma) const; - inline isl::union_map preimage_domain(isl::union_pw_multi_aff upma) const; - inline isl::union_map 
preimage_range(isl::multi_aff ma) const; - inline isl::union_map preimage_range(isl::pw_multi_aff pma) const; - inline isl::union_map preimage_range(isl::union_pw_multi_aff upma) const; - inline isl::union_map product(isl::union_map umap2) const; - inline isl::union_map project_out(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::union_map project_out_all_params() const; - inline isl::union_set range() const; - inline isl::union_map range_curry() const; - inline isl::union_map range_factor_domain() const; - inline isl::union_map range_factor_range() const; - inline isl::union_map range_map() const; - inline isl::union_map range_product(isl::union_map umap2) const; - inline isl::union_map range_reverse() const; - inline isl::union_map remove_divs() const; - inline isl::union_map remove_redundancies() const; - inline isl::union_map reset_user() const; - inline isl::union_map reverse() const; - inline isl::basic_map sample() const; - inline isl::union_map simple_hull() const; - inline isl::union_map subtract(isl::union_map umap2) const; - inline isl::union_map subtract_domain(isl::union_set dom) const; - inline isl::union_map subtract_range(isl::union_set dom) const; - inline isl::union_map uncurry() const; - inline isl::union_map unite(isl::union_map umap2) const; - inline isl::union_map universe() const; - inline isl::union_set wrap() const; - inline isl::union_map zip() const; -}; +isl::ast_expr ast_build::call_from(isl::pw_multi_aff pma) const +{ + auto res = isl_ast_build_call_from_pw_multi_aff(get(), pma.release()); + return manage(res); +} -// declarations for isl::union_map_list -inline union_map_list manage(__isl_take isl_union_map_list *ptr); -inline union_map_list manage_copy(__isl_keep isl_union_map_list *ptr); +isl::ast_expr ast_build::expr_from(isl::pw_aff pa) const +{ + auto res = isl_ast_build_expr_from_pw_aff(get(), pa.release()); + return manage(res); +} -class union_map_list { - friend inline union_map_list manage(__isl_take 
isl_union_map_list *ptr); - friend inline union_map_list manage_copy(__isl_keep isl_union_map_list *ptr); +isl::ast_expr ast_build::expr_from(isl::set set) const +{ + auto res = isl_ast_build_expr_from_set(get(), set.release()); + return manage(res); +} - isl_union_map_list *ptr = nullptr; +isl::ast_build ast_build::from_context(isl::set set) +{ + auto res = isl_ast_build_from_context(set.release()); + return manage(res); +} - inline explicit union_map_list(__isl_take isl_union_map_list *ptr); +isl::ast_node ast_build::node_from(isl::schedule schedule) const +{ + auto res = isl_ast_build_node_from_schedule(get(), schedule.release()); + return manage(res); +} -public: - inline /* implicit */ union_map_list(); - inline /* implicit */ union_map_list(const union_map_list &obj); - inline union_map_list &operator=(union_map_list obj); - inline ~union_map_list(); - inline __isl_give isl_union_map_list *copy() const &; - inline __isl_give isl_union_map_list *copy() && = delete; - inline __isl_keep isl_union_map_list *get() const; - inline __isl_give isl_union_map_list *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::union_map_list add(isl::union_map el) const; - static inline isl::union_map_list alloc(isl::ctx ctx, int n); - inline isl::union_map_list clear() const; - inline isl::union_map_list concat(isl::union_map_list list2) const; - inline isl::union_map_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::union_map_list from_union_map(isl::union_map el); - inline isl::union_map get_at(int index) const; - inline isl::union_map get_union_map(int index) const; - inline isl::union_map_list insert(unsigned int pos, isl::union_map el) const; - inline isl_size n_union_map() const; - inline isl::union_map_list reverse() const; - inline isl::union_map_list set_union_map(int index, isl::union_map el) const; - inline isl_size size() 
const; - inline isl::union_map_list swap(unsigned int pos1, unsigned int pos2) const; -}; +isl::ast_node ast_build::node_from_schedule_map(isl::union_map schedule) const +{ + auto res = isl_ast_build_node_from_schedule_map(get(), schedule.release()); + return manage(res); +} -// declarations for isl::union_pw_aff -inline union_pw_aff manage(__isl_take isl_union_pw_aff *ptr); -inline union_pw_aff manage_copy(__isl_keep isl_union_pw_aff *ptr); +isl::ast_build ast_build::restrict(isl::set set) const +{ + auto res = isl_ast_build_restrict(copy(), set.release()); + return manage(res).copy_callbacks(*this); +} -class union_pw_aff { - friend inline union_pw_aff manage(__isl_take isl_union_pw_aff *ptr); - friend inline union_pw_aff manage_copy(__isl_keep isl_union_pw_aff *ptr); +isl::union_map ast_build::schedule() const +{ + auto res = isl_ast_build_get_schedule(get()); + return manage(res); +} - isl_union_pw_aff *ptr = nullptr; +isl::union_map ast_build::get_schedule() const +{ + return schedule(); +} - inline explicit union_pw_aff(__isl_take isl_union_pw_aff *ptr); +// implementations for isl::ast_expr +ast_expr manage(__isl_take isl_ast_expr *ptr) { + return ast_expr(ptr); +} +ast_expr manage_copy(__isl_keep isl_ast_expr *ptr) { + ptr = isl_ast_expr_copy(ptr); + return ast_expr(ptr); +} -public: - inline /* implicit */ union_pw_aff(); - inline /* implicit */ union_pw_aff(const union_pw_aff &obj); - inline /* implicit */ union_pw_aff(isl::aff aff); - inline /* implicit */ union_pw_aff(isl::pw_aff pa); - inline explicit union_pw_aff(isl::ctx ctx, const std::string &str); - inline explicit union_pw_aff(isl::union_set domain, isl::val v); - inline union_pw_aff &operator=(union_pw_aff obj); - inline ~union_pw_aff(); - inline __isl_give isl_union_pw_aff *copy() const &; - inline __isl_give isl_union_pw_aff *copy() && = delete; - inline __isl_keep isl_union_pw_aff *get() const; - inline __isl_give isl_union_pw_aff *release(); - inline bool is_null() const; - inline isl::ctx 
ctx() const; - inline void dump() const; +ast_expr::ast_expr() + : ptr(nullptr) {} - inline isl::union_pw_aff add(isl::union_pw_aff upa2) const; - inline isl::union_pw_aff add_pw_aff(isl::pw_aff pa) const; - static inline isl::union_pw_aff aff_on_domain(isl::union_set domain, isl::aff aff); - inline isl::union_pw_aff align_params(isl::space model) const; - inline isl::union_set bind(isl::id id) const; - inline isl::union_pw_aff coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::union_set domain() const; - inline isl::union_pw_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - static inline isl::union_pw_aff empty(isl::space space); - static inline isl::union_pw_aff empty_ctx(isl::ctx ctx); - static inline isl::union_pw_aff empty_space(isl::space space); - inline isl::pw_aff extract_pw_aff(isl::space space) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::union_pw_aff floor() const; - inline stat foreach_pw_aff(const std::function &fn) const; - inline isl::pw_aff_list get_pw_aff_list() const; - inline isl::space get_space() const; - inline isl::union_pw_aff gist(isl::union_set context) const; - inline isl::union_pw_aff gist_params(isl::set context) const; - inline isl::union_pw_aff intersect_domain(isl::space space) const; - inline isl::union_pw_aff intersect_domain(isl::union_set uset) const; - inline isl::union_pw_aff intersect_domain_wrapped_domain(isl::union_set uset) const; - inline isl::union_pw_aff intersect_domain_wrapped_range(isl::union_set uset) const; - inline isl::union_pw_aff intersect_params(isl::set set) const; - inline boolean involves_nan() const; - inline isl::val max_val() const; - inline isl::val min_val() const; - inline isl::union_pw_aff mod_val(isl::val f) const; - inline isl_size n_pw_aff() const; - inline isl::union_pw_aff neg() const; - static inline isl::union_pw_aff param_on_domain_id(isl::union_set domain, isl::id id); - inline boolean 
plain_is_equal(const isl::union_pw_aff &upa2) const; - inline isl::union_pw_aff pullback(isl::union_pw_multi_aff upma) const; - static inline isl::union_pw_aff pw_aff_on_domain(isl::union_set domain, isl::pw_aff pa); - inline isl::union_pw_aff reset_user() const; - inline isl::union_pw_aff scale_down_val(isl::val v) const; - inline isl::union_pw_aff scale_val(isl::val v) const; - inline isl::union_pw_aff sub(isl::union_pw_aff upa2) const; - inline isl::union_pw_aff subtract_domain(isl::space space) const; - inline isl::union_pw_aff subtract_domain(isl::union_set uset) const; - inline isl::union_pw_aff union_add(isl::union_pw_aff upa2) const; - inline isl::union_set zero_union_set() const; -}; +ast_expr::ast_expr(const ast_expr &obj) + : ptr(nullptr) +{ + ptr = obj.copy(); +} -// declarations for isl::union_pw_aff_list -inline union_pw_aff_list manage(__isl_take isl_union_pw_aff_list *ptr); -inline union_pw_aff_list manage_copy(__isl_keep isl_union_pw_aff_list *ptr); +ast_expr::ast_expr(__isl_take isl_ast_expr *ptr) + : ptr(ptr) {} -class union_pw_aff_list { - friend inline union_pw_aff_list manage(__isl_take isl_union_pw_aff_list *ptr); - friend inline union_pw_aff_list manage_copy(__isl_keep isl_union_pw_aff_list *ptr); +ast_expr &ast_expr::operator=(ast_expr obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} - isl_union_pw_aff_list *ptr = nullptr; +ast_expr::~ast_expr() { + if (ptr) + isl_ast_expr_free(ptr); +} - inline explicit union_pw_aff_list(__isl_take isl_union_pw_aff_list *ptr); +__isl_give isl_ast_expr *ast_expr::copy() const & { + return isl_ast_expr_copy(ptr); +} -public: - inline /* implicit */ union_pw_aff_list(); - inline /* implicit */ union_pw_aff_list(const union_pw_aff_list &obj); - inline union_pw_aff_list &operator=(union_pw_aff_list obj); - inline ~union_pw_aff_list(); - inline __isl_give isl_union_pw_aff_list *copy() const &; - inline __isl_give isl_union_pw_aff_list *copy() && = delete; - inline __isl_keep isl_union_pw_aff_list 
*get() const; - inline __isl_give isl_union_pw_aff_list *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; +__isl_keep isl_ast_expr *ast_expr::get() const { + return ptr; +} - inline isl::union_pw_aff_list add(isl::union_pw_aff el) const; - static inline isl::union_pw_aff_list alloc(isl::ctx ctx, int n); - inline isl::union_pw_aff_list clear() const; - inline isl::union_pw_aff_list concat(isl::union_pw_aff_list list2) const; - inline isl::union_pw_aff_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::union_pw_aff_list from_union_pw_aff(isl::union_pw_aff el); - inline isl::union_pw_aff get_at(int index) const; - inline isl::union_pw_aff get_union_pw_aff(int index) const; - inline isl::union_pw_aff_list insert(unsigned int pos, isl::union_pw_aff el) const; - inline isl_size n_union_pw_aff() const; - inline isl::union_pw_aff_list reverse() const; - inline isl::union_pw_aff_list set_union_pw_aff(int index, isl::union_pw_aff el) const; - inline isl_size size() const; - inline isl::union_pw_aff_list swap(unsigned int pos1, unsigned int pos2) const; -}; +__isl_give isl_ast_expr *ast_expr::release() { + isl_ast_expr *tmp = ptr; + ptr = nullptr; + return tmp; +} -// declarations for isl::union_pw_multi_aff -inline union_pw_multi_aff manage(__isl_take isl_union_pw_multi_aff *ptr); -inline union_pw_multi_aff manage_copy(__isl_keep isl_union_pw_multi_aff *ptr); +bool ast_expr::is_null() const { + return ptr == nullptr; +} -class union_pw_multi_aff { - friend inline union_pw_multi_aff manage(__isl_take isl_union_pw_multi_aff *ptr); - friend inline union_pw_multi_aff manage_copy(__isl_keep isl_union_pw_multi_aff *ptr); +template +boolean ast_expr::isa_type(T subtype) const +{ + if (is_null()) + return boolean(); + return isl_ast_expr_get_type(get()) == subtype; +} +template +boolean ast_expr::isa() const +{ + return isa_type(T::type); +} +template +T 
ast_expr::as() const +{ + if (isa().is_false()) + isl_die(ctx().get(), isl_error_invalid, "not an object of the requested subtype", return T()); + return T(copy()); +} - isl_union_pw_multi_aff *ptr = nullptr; +isl::ctx ast_expr::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} - inline explicit union_pw_multi_aff(__isl_take isl_union_pw_multi_aff *ptr); +isl::ast_expr ast_expr::add(isl::ast_expr expr2) const +{ + auto res = isl_ast_expr_add(copy(), expr2.release()); + return manage(res); +} -public: - inline /* implicit */ union_pw_multi_aff(); - inline /* implicit */ union_pw_multi_aff(const union_pw_multi_aff &obj); - inline /* implicit */ union_pw_multi_aff(isl::aff aff); - inline explicit union_pw_multi_aff(isl::union_set uset); - inline /* implicit */ union_pw_multi_aff(isl::multi_aff ma); - inline explicit union_pw_multi_aff(isl::multi_union_pw_aff mupa); - inline /* implicit */ union_pw_multi_aff(isl::pw_multi_aff pma); - inline explicit union_pw_multi_aff(isl::union_map umap); - inline /* implicit */ union_pw_multi_aff(isl::union_pw_aff upa); - inline explicit union_pw_multi_aff(isl::ctx ctx, const std::string &str); - inline union_pw_multi_aff &operator=(union_pw_multi_aff obj); - inline ~union_pw_multi_aff(); - inline __isl_give isl_union_pw_multi_aff *copy() const &; - inline __isl_give isl_union_pw_multi_aff *copy() && = delete; - inline __isl_keep isl_union_pw_multi_aff *get() const; - inline __isl_give isl_union_pw_multi_aff *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; +isl::ast_expr ast_expr::address_of() const +{ + auto res = isl_ast_expr_address_of(copy()); + return manage(res); +} - inline isl::union_pw_multi_aff add(isl::union_pw_multi_aff upma2) const; - inline isl::union_pw_multi_aff add_pw_multi_aff(isl::pw_multi_aff pma) const; - inline isl::union_pw_multi_aff align_params(isl::space model) const; - inline isl::union_pw_multi_aff apply(isl::union_pw_multi_aff upma2) const; 
- inline isl::pw_multi_aff as_pw_multi_aff() const; - inline isl::union_pw_multi_aff coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::union_set domain() const; - inline isl::union_pw_multi_aff drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - static inline isl::union_pw_multi_aff empty(isl::space space); - static inline isl::union_pw_multi_aff empty(isl::ctx ctx); - static inline isl::union_pw_multi_aff empty_space(isl::space space); - inline isl::pw_multi_aff extract_pw_multi_aff(isl::space space) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline isl::union_pw_multi_aff flat_range_product(isl::union_pw_multi_aff upma2) const; - inline stat foreach_pw_multi_aff(const std::function &fn) const; - static inline isl::union_pw_multi_aff from_union_set(isl::union_set uset); - inline isl::pw_multi_aff_list get_pw_multi_aff_list() const; - inline isl::space get_space() const; - inline isl::union_pw_aff get_union_pw_aff(int pos) const; - inline isl::union_pw_multi_aff gist(isl::union_set context) const; - inline isl::union_pw_multi_aff gist_params(isl::set context) const; - inline isl::union_pw_multi_aff intersect_domain(isl::space space) const; - inline isl::union_pw_multi_aff intersect_domain(isl::union_set uset) const; - inline isl::union_pw_multi_aff intersect_domain_wrapped_domain(isl::union_set uset) const; - inline isl::union_pw_multi_aff intersect_domain_wrapped_range(isl::union_set uset) const; - inline isl::union_pw_multi_aff intersect_params(isl::set set) const; - inline boolean involves_locals() const; - inline boolean involves_nan() const; - inline boolean isa_pw_multi_aff() const; - static inline isl::union_pw_multi_aff multi_val_on_domain(isl::union_set domain, isl::multi_val mv); - inline isl_size n_pw_multi_aff() const; - inline isl::union_pw_multi_aff neg() const; - inline boolean plain_is_empty() const; - inline boolean plain_is_equal(const isl::union_pw_multi_aff 
&upma2) const; - inline isl::union_pw_multi_aff preimage_domain_wrapped_domain(isl::union_pw_multi_aff upma2) const; - inline isl::union_pw_multi_aff pullback(isl::union_pw_multi_aff upma2) const; - inline isl::union_pw_multi_aff range_factor_domain() const; - inline isl::union_pw_multi_aff range_factor_range() const; - inline isl::union_pw_multi_aff range_product(isl::union_pw_multi_aff upma2) const; - inline isl::union_pw_multi_aff reset_user() const; - inline isl::union_pw_multi_aff scale_down_val(isl::val val) const; - inline isl::union_pw_multi_aff scale_multi_val(isl::multi_val mv) const; - inline isl::union_pw_multi_aff scale_val(isl::val val) const; - inline isl::union_pw_multi_aff sub(isl::union_pw_multi_aff upma2) const; - inline isl::union_pw_multi_aff subtract_domain(isl::space space) const; - inline isl::union_pw_multi_aff subtract_domain(isl::union_set uset) const; - inline isl::union_pw_multi_aff union_add(isl::union_pw_multi_aff upma2) const; -}; +isl::ast_expr ast_expr::eq(isl::ast_expr expr2) const +{ + auto res = isl_ast_expr_eq(copy(), expr2.release()); + return manage(res); +} + +isl::ast_expr ast_expr::from_val(isl::val v) +{ + auto res = isl_ast_expr_from_val(v.release()); + return manage(res); +} + +isl::id ast_expr::id() const +{ + auto res = isl_ast_expr_get_id(get()); + return manage(res); +} + +isl::id ast_expr::get_id() const +{ + return id(); +} + +isl::ast_expr ast_expr::le(isl::ast_expr expr2) const +{ + auto res = isl_ast_expr_le(copy(), expr2.release()); + return manage(res); +} + +isl::ast_expr ast_expr::mul(isl::ast_expr expr2) const +{ + auto res = isl_ast_expr_mul(copy(), expr2.release()); + return manage(res); +} -// declarations for isl::union_pw_multi_aff_list -inline union_pw_multi_aff_list manage(__isl_take isl_union_pw_multi_aff_list *ptr); -inline union_pw_multi_aff_list manage_copy(__isl_keep isl_union_pw_multi_aff_list *ptr); +isl::ast_expr ast_expr::op_arg(int pos) const +{ + auto res = isl_ast_expr_get_op_arg(get(), 
pos); + return manage(res); +} -class union_pw_multi_aff_list { - friend inline union_pw_multi_aff_list manage(__isl_take isl_union_pw_multi_aff_list *ptr); - friend inline union_pw_multi_aff_list manage_copy(__isl_keep isl_union_pw_multi_aff_list *ptr); +isl::ast_expr ast_expr::get_op_arg(int pos) const +{ + return op_arg(pos); +} - isl_union_pw_multi_aff_list *ptr = nullptr; +std::string ast_expr::to_C_str() const +{ + auto res = isl_ast_expr_to_C_str(get()); + std::string tmp(res); + free(res); + return tmp; +} - inline explicit union_pw_multi_aff_list(__isl_take isl_union_pw_multi_aff_list *ptr); +isl::val ast_expr::val() const +{ + auto res = isl_ast_expr_get_val(get()); + return manage(res); +} -public: - inline /* implicit */ union_pw_multi_aff_list(); - inline /* implicit */ union_pw_multi_aff_list(const union_pw_multi_aff_list &obj); - inline union_pw_multi_aff_list &operator=(union_pw_multi_aff_list obj); - inline ~union_pw_multi_aff_list(); - inline __isl_give isl_union_pw_multi_aff_list *copy() const &; - inline __isl_give isl_union_pw_multi_aff_list *copy() && = delete; - inline __isl_keep isl_union_pw_multi_aff_list *get() const; - inline __isl_give isl_union_pw_multi_aff_list *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::union_pw_multi_aff_list add(isl::union_pw_multi_aff el) const; - static inline isl::union_pw_multi_aff_list alloc(isl::ctx ctx, int n); - inline isl::union_pw_multi_aff_list clear() const; - inline isl::union_pw_multi_aff_list concat(isl::union_pw_multi_aff_list list2) const; - inline isl::union_pw_multi_aff_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::union_pw_multi_aff_list from_union_pw_multi_aff(isl::union_pw_multi_aff el); - inline isl::union_pw_multi_aff get_at(int index) const; - inline isl::union_pw_multi_aff get_union_pw_multi_aff(int index) const; - inline 
isl::union_pw_multi_aff_list insert(unsigned int pos, isl::union_pw_multi_aff el) const; - inline isl_size n_union_pw_multi_aff() const; - inline isl::union_pw_multi_aff_list reverse() const; - inline isl::union_pw_multi_aff_list set_union_pw_multi_aff(int index, isl::union_pw_multi_aff el) const; - inline isl_size size() const; - inline isl::union_pw_multi_aff_list swap(unsigned int pos1, unsigned int pos2) const; -}; +isl::val ast_expr::get_val() const +{ + return val(); +} -// declarations for isl::union_pw_qpolynomial -inline union_pw_qpolynomial manage(__isl_take isl_union_pw_qpolynomial *ptr); -inline union_pw_qpolynomial manage_copy(__isl_keep isl_union_pw_qpolynomial *ptr); +inline std::ostream &operator<<(std::ostream &os, const ast_expr &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} -class union_pw_qpolynomial { - friend inline union_pw_qpolynomial manage(__isl_take isl_union_pw_qpolynomial *ptr); - friend inline union_pw_qpolynomial manage_copy(__isl_keep isl_union_pw_qpolynomial *ptr); +// implementations for isl::ast_expr_id +ast_expr_id::ast_expr_id() + : ast_expr() {} - isl_union_pw_qpolynomial *ptr = nullptr; +ast_expr_id::ast_expr_id(const ast_expr_id &obj) + : ast_expr(obj) +{ +} - inline explicit union_pw_qpolynomial(__isl_take isl_union_pw_qpolynomial *ptr); +ast_expr_id::ast_expr_id(__isl_take isl_ast_expr *ptr) + : ast_expr(ptr) {} -public: - inline /* implicit */ union_pw_qpolynomial(); - inline /* implicit */ union_pw_qpolynomial(const union_pw_qpolynomial &obj); - inline explicit union_pw_qpolynomial(isl::ctx ctx, const std::string &str); - inline union_pw_qpolynomial &operator=(union_pw_qpolynomial obj); - inline ~union_pw_qpolynomial(); - inline __isl_give isl_union_pw_qpolynomial *copy() const &; - inline __isl_give isl_union_pw_qpolynomial *copy() && = delete; - inline __isl_keep isl_union_pw_qpolynomial *get() const; - 
inline __isl_give isl_union_pw_qpolynomial *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; +ast_expr_id &ast_expr_id::operator=(ast_expr_id obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} - inline isl::union_pw_qpolynomial add(isl::union_pw_qpolynomial upwqp2) const; - inline isl::union_pw_qpolynomial add_pw_qpolynomial(isl::pw_qpolynomial pwqp) const; - inline isl::union_pw_qpolynomial align_params(isl::space model) const; - inline isl::union_pw_qpolynomial coalesce() const; - inline isl_size dim(isl::dim type) const; - inline isl::union_set domain() const; - inline isl::union_pw_qpolynomial drop_dims(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::val eval(isl::point pnt) const; - inline isl::pw_qpolynomial extract_pw_qpolynomial(isl::space space) const; - inline int find_dim_by_name(isl::dim type, const std::string &name) const; - inline stat foreach_pw_qpolynomial(const std::function &fn) const; - static inline isl::union_pw_qpolynomial from_pw_qpolynomial(isl::pw_qpolynomial pwqp); - inline isl::pw_qpolynomial_list get_pw_qpolynomial_list() const; - inline isl::space get_space() const; - inline isl::union_pw_qpolynomial gist(isl::union_set context) const; - inline isl::union_pw_qpolynomial gist_params(isl::set context) const; - inline isl::union_pw_qpolynomial intersect_domain(isl::union_set uset) const; - inline isl::union_pw_qpolynomial intersect_domain_space(isl::space space) const; - inline isl::union_pw_qpolynomial intersect_domain_union_set(isl::union_set uset) const; - inline isl::union_pw_qpolynomial intersect_domain_wrapped_domain(isl::union_set uset) const; - inline isl::union_pw_qpolynomial intersect_domain_wrapped_range(isl::union_set uset) const; - inline isl::union_pw_qpolynomial intersect_params(isl::set set) const; - inline boolean involves_nan() const; - inline isl::union_pw_qpolynomial mul(isl::union_pw_qpolynomial upwqp2) const; - inline isl_size n_pw_qpolynomial() const; - inline 
isl::union_pw_qpolynomial neg() const; - inline boolean plain_is_equal(const isl::union_pw_qpolynomial &upwqp2) const; - inline isl::union_pw_qpolynomial reset_user() const; - inline isl::union_pw_qpolynomial scale_down_val(isl::val v) const; - inline isl::union_pw_qpolynomial scale_val(isl::val v) const; - inline isl::union_pw_qpolynomial sub(isl::union_pw_qpolynomial upwqp2) const; - inline isl::union_pw_qpolynomial subtract_domain(isl::union_set uset) const; - inline isl::union_pw_qpolynomial subtract_domain_space(isl::space space) const; - inline isl::union_pw_qpolynomial subtract_domain_union_set(isl::union_set uset) const; - inline isl::union_pw_qpolynomial to_polynomial(int sign) const; - static inline isl::union_pw_qpolynomial zero(isl::space space); - static inline isl::union_pw_qpolynomial zero_ctx(isl::ctx ctx); - static inline isl::union_pw_qpolynomial zero_space(isl::space space); -}; +isl::ctx ast_expr_id::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} -// declarations for isl::union_set -inline union_set manage(__isl_take isl_union_set *ptr); -inline union_set manage_copy(__isl_keep isl_union_set *ptr); +isl::id ast_expr_id::id() const +{ + auto res = isl_ast_expr_id_get_id(get()); + return manage(res); +} -class union_set { - friend inline union_set manage(__isl_take isl_union_set *ptr); - friend inline union_set manage_copy(__isl_keep isl_union_set *ptr); +isl::id ast_expr_id::get_id() const +{ + return id(); +} - isl_union_set *ptr = nullptr; +inline std::ostream &operator<<(std::ostream &os, const ast_expr_id &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} - inline explicit union_set(__isl_take isl_union_set *ptr); +// implementations for isl::ast_expr_int +ast_expr_int::ast_expr_int() + : ast_expr() {} -public: - inline /* implicit */ union_set(); - inline /* implicit */ union_set(const union_set &obj); - inline 
/* implicit */ union_set(isl::basic_set bset); - inline /* implicit */ union_set(isl::point pnt); - inline /* implicit */ union_set(isl::set set); - inline explicit union_set(isl::ctx ctx, const std::string &str); - inline union_set &operator=(union_set obj); - inline ~union_set(); - inline __isl_give isl_union_set *copy() const &; - inline __isl_give isl_union_set *copy() && = delete; - inline __isl_keep isl_union_set *get() const; - inline __isl_give isl_union_set *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; +ast_expr_int::ast_expr_int(const ast_expr_int &obj) + : ast_expr(obj) +{ +} - inline isl::union_set affine_hull() const; - inline isl::union_set align_params(isl::space model) const; - inline isl::union_set apply(isl::union_map umap) const; - inline isl::union_set coalesce() const; - inline isl::union_set coefficients() const; - inline isl::schedule compute_schedule(isl::union_map validity, isl::union_map proximity) const; - inline boolean contains(const isl::space &space) const; - inline isl::union_set detect_equalities() const; - inline isl_size dim(isl::dim type) const; - static inline isl::union_set empty(isl::ctx ctx); - inline isl::set extract_set(isl::space space) const; - inline stat foreach_point(const std::function &fn) const; - inline stat foreach_set(const std::function &fn) const; - inline isl::basic_set_list get_basic_set_list() const; - inline uint32_t get_hash() const; - inline isl::set_list get_set_list() const; - inline isl::space get_space() const; - inline isl::union_set gist(isl::union_set context) const; - inline isl::union_set gist_params(isl::set set) const; - inline isl::union_map identity() const; - inline isl::union_pw_multi_aff identity_union_pw_multi_aff() const; - inline isl::union_set intersect(isl::union_set uset2) const; - inline isl::union_set intersect_params(isl::set set) const; - inline boolean is_disjoint(const isl::union_set &uset2) const; - inline boolean 
is_empty() const; - inline boolean is_equal(const isl::union_set &uset2) const; - inline boolean is_params() const; - inline boolean is_strict_subset(const isl::union_set &uset2) const; - inline boolean is_subset(const isl::union_set &uset2) const; - inline boolean isa_set() const; - inline isl::union_map lex_ge_union_set(isl::union_set uset2) const; - inline isl::union_map lex_gt_union_set(isl::union_set uset2) const; - inline isl::union_map lex_le_union_set(isl::union_set uset2) const; - inline isl::union_map lex_lt_union_set(isl::union_set uset2) const; - inline isl::union_set lexmax() const; - inline isl::union_set lexmin() const; - inline isl::multi_val min_multi_union_pw_aff(const isl::multi_union_pw_aff &obj) const; - inline isl_size n_set() const; - inline isl::set params() const; - inline isl::union_set polyhedral_hull() const; - inline isl::union_set preimage(isl::multi_aff ma) const; - inline isl::union_set preimage(isl::pw_multi_aff pma) const; - inline isl::union_set preimage(isl::union_pw_multi_aff upma) const; - inline isl::union_set product(isl::union_set uset2) const; - inline isl::union_set project_out(isl::dim type, unsigned int first, unsigned int n) const; - inline isl::union_set project_out_all_params() const; - inline isl::union_set remove_divs() const; - inline isl::union_set remove_redundancies() const; - inline isl::union_set reset_user() const; - inline isl::basic_set sample() const; - inline isl::point sample_point() const; - inline isl::union_set simple_hull() const; - inline isl::union_set solutions() const; - inline isl::union_set subtract(isl::union_set uset2) const; - inline isl::union_set unite(isl::union_set uset2) const; - inline isl::union_set universe() const; - inline isl::union_map unwrap() const; - inline isl::union_map wrapped_domain_map() const; -}; +ast_expr_int::ast_expr_int(__isl_take isl_ast_expr *ptr) + : ast_expr(ptr) {} -// declarations for isl::union_set_list -inline union_set_list manage(__isl_take 
isl_union_set_list *ptr); -inline union_set_list manage_copy(__isl_keep isl_union_set_list *ptr); +ast_expr_int &ast_expr_int::operator=(ast_expr_int obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} -class union_set_list { - friend inline union_set_list manage(__isl_take isl_union_set_list *ptr); - friend inline union_set_list manage_copy(__isl_keep isl_union_set_list *ptr); +isl::ctx ast_expr_int::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} - isl_union_set_list *ptr = nullptr; +isl::val ast_expr_int::val() const +{ + auto res = isl_ast_expr_int_get_val(get()); + return manage(res); +} - inline explicit union_set_list(__isl_take isl_union_set_list *ptr); +isl::val ast_expr_int::get_val() const +{ + return val(); +} -public: - inline /* implicit */ union_set_list(); - inline /* implicit */ union_set_list(const union_set_list &obj); - inline union_set_list &operator=(union_set_list obj); - inline ~union_set_list(); - inline __isl_give isl_union_set_list *copy() const &; - inline __isl_give isl_union_set_list *copy() && = delete; - inline __isl_keep isl_union_set_list *get() const; - inline __isl_give isl_union_set_list *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; +inline std::ostream &operator<<(std::ostream &os, const ast_expr_int &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} - inline isl::union_set_list add(isl::union_set el) const; - static inline isl::union_set_list alloc(isl::ctx ctx, int n); - inline isl::union_set_list clear() const; - inline isl::union_set_list concat(isl::union_set_list list2) const; - inline isl::union_set_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::union_set_list from_union_set(isl::union_set el); - inline isl::union_set get_at(int index) const; - inline 
isl::union_set get_union_set(int index) const; - inline isl::union_set_list insert(unsigned int pos, isl::union_set el) const; - inline isl_size n_union_set() const; - inline isl::union_set_list reverse() const; - inline isl::union_set_list set_union_set(int index, isl::union_set el) const; - inline isl_size size() const; - inline isl::union_set_list swap(unsigned int pos1, unsigned int pos2) const; - inline isl::union_set unite() const; -}; +// implementations for isl::ast_expr_op +ast_expr_op::ast_expr_op() + : ast_expr() {} -// declarations for isl::val -inline val manage(__isl_take isl_val *ptr); -inline val manage_copy(__isl_keep isl_val *ptr); +ast_expr_op::ast_expr_op(const ast_expr_op &obj) + : ast_expr(obj) +{ +} -class val { - friend inline val manage(__isl_take isl_val *ptr); - friend inline val manage_copy(__isl_keep isl_val *ptr); +ast_expr_op::ast_expr_op(__isl_take isl_ast_expr *ptr) + : ast_expr(ptr) {} - isl_val *ptr = nullptr; +ast_expr_op &ast_expr_op::operator=(ast_expr_op obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} - inline explicit val(__isl_take isl_val *ptr); +template +boolean ast_expr_op::isa_type(T subtype) const +{ + if (is_null()) + return boolean(); + return isl_ast_expr_op_get_type(get()) == subtype; +} +template +boolean ast_expr_op::isa() const +{ + return isa_type(T::type); +} +template +T ast_expr_op::as() const +{ + if (isa().is_false()) + isl_die(ctx().get(), isl_error_invalid, "not an object of the requested subtype", return T()); + return T(copy()); +} -public: - inline /* implicit */ val(); - inline /* implicit */ val(const val &obj); - inline explicit val(isl::ctx ctx, long i); - inline explicit val(isl::ctx ctx, const std::string &str); - inline val &operator=(val obj); - inline ~val(); - inline __isl_give isl_val *copy() const &; - inline __isl_give isl_val *copy() && = delete; - inline __isl_keep isl_val *get() const; - inline __isl_give isl_val *release(); - inline bool is_null() const; - inline isl::ctx 
ctx() const; - inline void dump() const; +isl::ctx ast_expr_op::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} - inline isl::val abs() const; - inline boolean abs_eq(const isl::val &v2) const; - inline isl::val add(isl::val v2) const; - inline isl::val add_ui(unsigned long v2) const; - inline isl::val ceil() const; - inline int cmp_si(long i) const; - inline isl::val div(isl::val v2) const; - inline isl::val div_ui(unsigned long v2) const; - inline boolean eq(const isl::val &v2) const; - inline boolean eq_si(long i) const; - inline isl::val floor() const; - inline isl::val gcd(isl::val v2) const; - inline boolean ge(const isl::val &v2) const; - inline uint32_t get_hash() const; - inline long get_num_si() const; - inline boolean gt(const isl::val &v2) const; - inline boolean gt_si(long i) const; - static inline isl::val infty(isl::ctx ctx); - static inline isl::val int_from_ui(isl::ctx ctx, unsigned long u); - inline isl::val inv() const; - inline boolean is_divisible_by(const isl::val &v2) const; - inline boolean is_infty() const; - inline boolean is_int() const; - inline boolean is_nan() const; - inline boolean is_neg() const; - inline boolean is_neginfty() const; - inline boolean is_negone() const; - inline boolean is_nonneg() const; - inline boolean is_nonpos() const; - inline boolean is_one() const; - inline boolean is_pos() const; - inline boolean is_rat() const; - inline boolean is_zero() const; - inline boolean le(const isl::val &v2) const; - inline boolean lt(const isl::val &v2) const; - inline isl::val max(isl::val v2) const; - inline isl::val min(isl::val v2) const; - inline isl::val mod(isl::val v2) const; - inline isl::val mul(isl::val v2) const; - inline isl::val mul_ui(unsigned long v2) const; - inline isl_size n_abs_num_chunks(size_t size) const; - static inline isl::val nan(isl::ctx ctx); - inline boolean ne(const isl::val &v2) const; - inline isl::val neg() const; - static inline isl::val neginfty(isl::ctx ctx); - static inline 
isl::val negone(isl::ctx ctx); - static inline isl::val one(isl::ctx ctx); - inline isl::val pow2() const; - inline isl::val set_si(long i) const; - inline int sgn() const; - inline isl::val sub(isl::val v2) const; - inline isl::val sub_ui(unsigned long v2) const; - inline isl::val trunc() const; - static inline isl::val zero(isl::ctx ctx); -}; +isl::ast_expr ast_expr_op::arg(int pos) const +{ + auto res = isl_ast_expr_op_get_arg(get(), pos); + return manage(res); +} -// declarations for isl::val_list -inline val_list manage(__isl_take isl_val_list *ptr); -inline val_list manage_copy(__isl_keep isl_val_list *ptr); +isl::ast_expr ast_expr_op::get_arg(int pos) const +{ + return arg(pos); +} + +class size ast_expr_op::n_arg() const +{ + auto res = isl_ast_expr_op_get_n_arg(get()); + return manage(res); +} + +class size ast_expr_op::get_n_arg() const +{ + return n_arg(); +} + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} -class val_list { - friend inline val_list manage(__isl_take isl_val_list *ptr); - friend inline val_list manage_copy(__isl_keep isl_val_list *ptr); +// implementations for isl::ast_expr_op_access +ast_expr_op_access::ast_expr_op_access() + : ast_expr_op() {} - isl_val_list *ptr = nullptr; +ast_expr_op_access::ast_expr_op_access(const ast_expr_op_access &obj) + : ast_expr_op(obj) +{ +} - inline explicit val_list(__isl_take isl_val_list *ptr); +ast_expr_op_access::ast_expr_op_access(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} -public: - inline /* implicit */ val_list(); - inline /* implicit */ val_list(const val_list &obj); - inline val_list &operator=(val_list obj); - inline ~val_list(); - inline __isl_give isl_val_list *copy() const &; - inline __isl_give isl_val_list *copy() && = delete; - inline __isl_keep isl_val_list *get() const; - inline __isl_give 
isl_val_list *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; +ast_expr_op_access &ast_expr_op_access::operator=(ast_expr_op_access obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} - inline isl::val_list add(isl::val el) const; - static inline isl::val_list alloc(isl::ctx ctx, int n); - inline isl::val_list clear() const; - inline isl::val_list concat(isl::val_list list2) const; - inline isl::val_list drop(unsigned int first, unsigned int n) const; - inline stat foreach(const std::function &fn) const; - static inline isl::val_list from_val(isl::val el); - inline isl::val get_at(int index) const; - inline isl::val get_val(int index) const; - inline isl::val_list insert(unsigned int pos, isl::val el) const; - inline isl_size n_val() const; - inline isl::val_list reverse() const; - inline isl::val_list set_val(int index, isl::val el) const; - inline isl_size size() const; - inline isl::val_list swap(unsigned int pos1, unsigned int pos2) const; -}; +isl::ctx ast_expr_op_access::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} -// declarations for isl::vec -inline vec manage(__isl_take isl_vec *ptr); -inline vec manage_copy(__isl_keep isl_vec *ptr); +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_access &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} -class vec { - friend inline vec manage(__isl_take isl_vec *ptr); - friend inline vec manage_copy(__isl_keep isl_vec *ptr); +// implementations for isl::ast_expr_op_add +ast_expr_op_add::ast_expr_op_add() + : ast_expr_op() {} - isl_vec *ptr = nullptr; +ast_expr_op_add::ast_expr_op_add(const ast_expr_op_add &obj) + : ast_expr_op(obj) +{ +} - inline explicit vec(__isl_take isl_vec *ptr); +ast_expr_op_add::ast_expr_op_add(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} -public: - inline /* implicit */ vec(); 
- inline /* implicit */ vec(const vec &obj); - inline vec &operator=(vec obj); - inline ~vec(); - inline __isl_give isl_vec *copy() const &; - inline __isl_give isl_vec *copy() && = delete; - inline __isl_keep isl_vec *get() const; - inline __isl_give isl_vec *release(); - inline bool is_null() const; - inline isl::ctx ctx() const; - inline void dump() const; - - inline isl::vec add(isl::vec vec2) const; - inline isl::vec add_els(unsigned int n) const; - static inline isl::vec alloc(isl::ctx ctx, unsigned int size); - inline isl::vec ceil() const; - inline isl::vec clr() const; - inline int cmp_element(const isl::vec &vec2, int pos) const; - inline isl::vec concat(isl::vec vec2) const; - inline isl::vec drop_els(unsigned int pos, unsigned int n) const; - inline isl::vec extend(unsigned int size) const; - inline isl::val get_element_val(int pos) const; - inline isl::vec insert_els(unsigned int pos, unsigned int n) const; - inline isl::vec insert_zero_els(unsigned int pos, unsigned int n) const; - inline boolean is_equal(const isl::vec &vec2) const; - inline isl::vec mat_product(isl::mat mat) const; - inline isl::vec move_els(unsigned int dst_col, unsigned int src_col, unsigned int n) const; - inline isl::vec neg() const; - inline isl::vec set_element_si(int pos, int v) const; - inline isl::vec set_element_val(int pos, isl::val v) const; - inline isl::vec set_si(int v) const; - inline isl::vec set_val(isl::val v) const; - inline isl_size size() const; - inline isl::vec sort() const; - static inline isl::vec zero(isl::ctx ctx, unsigned int size); - inline isl::vec zero_extend(unsigned int size) const; -}; +ast_expr_op_add &ast_expr_op_add::operator=(ast_expr_op_add obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} -// implementations for isl::aff -aff manage(__isl_take isl_aff *ptr) { - return aff(ptr); +isl::ctx ast_expr_op_add::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -aff manage_copy(__isl_keep isl_aff *ptr) { - ptr = 
isl_aff_copy(ptr); - return aff(ptr); + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_add &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -aff::aff() - : ptr(nullptr) {} +// implementations for isl::ast_expr_op_address_of +ast_expr_op_address_of::ast_expr_op_address_of() + : ast_expr_op() {} -aff::aff(const aff &obj) - : ptr(nullptr) +ast_expr_op_address_of::ast_expr_op_address_of(const ast_expr_op_address_of &obj) + : ast_expr_op(obj) { - ptr = obj.copy(); } +ast_expr_op_address_of::ast_expr_op_address_of(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} -aff::aff(__isl_take isl_aff *ptr) - : ptr(ptr) {} +ast_expr_op_address_of &ast_expr_op_address_of::operator=(ast_expr_op_address_of obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} -aff::aff(isl::ctx ctx, const std::string &str) -{ - auto res = isl_aff_read_from_str(ctx.release(), str.c_str()); - ptr = res; +isl::ctx ast_expr_op_address_of::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -aff::aff(isl::local_space ls, isl::val val) + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_address_of &obj) { - auto res = isl_aff_val_on_domain(ls.release(), val.release()); - ptr = res; + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -aff::aff(isl::local_space ls) + +// implementations for isl::ast_expr_op_and +ast_expr_op_and::ast_expr_op_and() + : ast_expr_op() {} + +ast_expr_op_and::ast_expr_op_and(const ast_expr_op_and &obj) + : ast_expr_op(obj) { - auto res = isl_aff_zero_on_domain(ls.release()); - ptr = res; } -aff &aff::operator=(aff obj) { +ast_expr_op_and::ast_expr_op_and(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_and &ast_expr_op_and::operator=(ast_expr_op_and obj) { std::swap(this->ptr, obj.ptr); 
return *this; } -aff::~aff() { - if (ptr) - isl_aff_free(ptr); +isl::ctx ast_expr_op_and::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -__isl_give isl_aff *aff::copy() const & { - return isl_aff_copy(ptr); +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_and &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -__isl_keep isl_aff *aff::get() const { - return ptr; +// implementations for isl::ast_expr_op_and_then +ast_expr_op_and_then::ast_expr_op_and_then() + : ast_expr_op() {} + +ast_expr_op_and_then::ast_expr_op_and_then(const ast_expr_op_and_then &obj) + : ast_expr_op(obj) +{ } -__isl_give isl_aff *aff::release() { - isl_aff *tmp = ptr; - ptr = nullptr; - return tmp; +ast_expr_op_and_then::ast_expr_op_and_then(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_and_then &ast_expr_op_and_then::operator=(ast_expr_op_and_then obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -bool aff::is_null() const { - return ptr == nullptr; +isl::ctx ast_expr_op_and_then::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_and_then &obj) +{ + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } +// implementations for isl::ast_expr_op_call +ast_expr_op_call::ast_expr_op_call() + : ast_expr_op() {} -isl::ctx aff::ctx() const { - return isl::ctx(isl_aff_get_ctx(ptr)); +ast_expr_op_call::ast_expr_op_call(const ast_expr_op_call &obj) + : ast_expr_op(obj) +{ } -void aff::dump() const { - isl_aff_dump(get()); +ast_expr_op_call::ast_expr_op_call(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_call &ast_expr_op_call::operator=(ast_expr_op_call obj) { + std::swap(this->ptr, obj.ptr); + return *this; } +isl::ctx 
ast_expr_op_call::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} -isl::aff aff::add(isl::aff aff2) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_call &obj) { - auto res = isl_aff_add(copy(), aff2.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::add_coefficient_si(isl::dim type, int pos, int v) const +// implementations for isl::ast_expr_op_cond +ast_expr_op_cond::ast_expr_op_cond() + : ast_expr_op() {} + +ast_expr_op_cond::ast_expr_op_cond(const ast_expr_op_cond &obj) + : ast_expr_op(obj) { - auto res = isl_aff_add_coefficient_si(copy(), static_cast(type), pos, v); - return manage(res); } -isl::aff aff::add_coefficient_val(isl::dim type, int pos, isl::val v) const -{ - auto res = isl_aff_add_coefficient_val(copy(), static_cast(type), pos, v.release()); - return manage(res); +ast_expr_op_cond::ast_expr_op_cond(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_cond &ast_expr_op_cond::operator=(ast_expr_op_cond obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::add_constant(isl::val v) const -{ - auto res = isl_aff_add_constant_val(copy(), v.release()); - return manage(res); +isl::ctx ast_expr_op_cond::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::add_constant_num_si(int v) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_cond &obj) { - auto res = isl_aff_add_constant_num_si(copy(), v); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::add_constant_si(int v) const +// implementations for isl::ast_expr_op_div +ast_expr_op_div::ast_expr_op_div() + : ast_expr_op() {} + +ast_expr_op_div::ast_expr_op_div(const ast_expr_op_div &obj) + : 
ast_expr_op(obj) { - auto res = isl_aff_add_constant_si(copy(), v); - return manage(res); } -isl::aff aff::add_dims(isl::dim type, unsigned int n) const -{ - auto res = isl_aff_add_dims(copy(), static_cast(type), n); - return manage(res); +ast_expr_op_div::ast_expr_op_div(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_div &ast_expr_op_div::operator=(ast_expr_op_div obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::align_params(isl::space model) const -{ - auto res = isl_aff_align_params(copy(), model.release()); - return manage(res); +isl::ctx ast_expr_op_div::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::basic_set aff::bind(isl::id id) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_div &obj) { - auto res = isl_aff_bind_id(copy(), id.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::ceil() const +// implementations for isl::ast_expr_op_eq +ast_expr_op_eq::ast_expr_op_eq() + : ast_expr_op() {} + +ast_expr_op_eq::ast_expr_op_eq(const ast_expr_op_eq &obj) + : ast_expr_op(obj) { - auto res = isl_aff_ceil(copy()); - return manage(res); } -int aff::coefficient_sgn(isl::dim type, int pos) const -{ - auto res = isl_aff_coefficient_sgn(get(), static_cast(type), pos); - return res; +ast_expr_op_eq::ast_expr_op_eq(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_eq &ast_expr_op_eq::operator=(ast_expr_op_eq obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl_size aff::dim(isl::dim type) const -{ - auto res = isl_aff_dim(get(), static_cast(type)); - return res; +isl::ctx ast_expr_op_eq::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::div(isl::aff aff2) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_eq &obj) { - auto res = isl_aff_div(copy(), 
aff2.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +// implementations for isl::ast_expr_op_fdiv_q +ast_expr_op_fdiv_q::ast_expr_op_fdiv_q() + : ast_expr_op() {} + +ast_expr_op_fdiv_q::ast_expr_op_fdiv_q(const ast_expr_op_fdiv_q &obj) + : ast_expr_op(obj) { - auto res = isl_aff_drop_dims(copy(), static_cast(type), first, n); - return manage(res); } -isl::basic_set aff::eq_basic_set(isl::aff aff2) const -{ - auto res = isl_aff_eq_basic_set(copy(), aff2.release()); - return manage(res); +ast_expr_op_fdiv_q::ast_expr_op_fdiv_q(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_fdiv_q &ast_expr_op_fdiv_q::operator=(ast_expr_op_fdiv_q obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set aff::eq_set(isl::aff aff2) const -{ - auto res = isl_aff_eq_set(copy(), aff2.release()); - return manage(res); +isl::ctx ast_expr_op_fdiv_q::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::val aff::eval(isl::point pnt) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_fdiv_q &obj) { - auto res = isl_aff_eval(copy(), pnt.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -int aff::find_dim_by_name(isl::dim type, const std::string &name) const +// implementations for isl::ast_expr_op_ge +ast_expr_op_ge::ast_expr_op_ge() + : ast_expr_op() {} + +ast_expr_op_ge::ast_expr_op_ge(const ast_expr_op_ge &obj) + : ast_expr_op(obj) { - auto res = isl_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; } -isl::aff aff::floor() const -{ - auto res = isl_aff_floor(copy()); - return manage(res); +ast_expr_op_ge::ast_expr_op_ge(__isl_take 
isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_ge &ast_expr_op_ge::operator=(ast_expr_op_ge obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::from_range() const -{ - auto res = isl_aff_from_range(copy()); - return manage(res); +isl::ctx ast_expr_op_ge::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::basic_set aff::ge_basic_set(isl::aff aff2) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_ge &obj) { - auto res = isl_aff_ge_basic_set(copy(), aff2.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set aff::ge_set(isl::aff aff2) const +// implementations for isl::ast_expr_op_gt +ast_expr_op_gt::ast_expr_op_gt() + : ast_expr_op() {} + +ast_expr_op_gt::ast_expr_op_gt(const ast_expr_op_gt &obj) + : ast_expr_op(obj) { - auto res = isl_aff_ge_set(copy(), aff2.release()); - return manage(res); } -isl::val aff::get_coefficient_val(isl::dim type, int pos) const +ast_expr_op_gt::ast_expr_op_gt(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_gt &ast_expr_op_gt::operator=(ast_expr_op_gt obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_expr_op_gt::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_gt &obj) { - auto res = isl_aff_get_coefficient_val(get(), static_cast(type), pos); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::val aff::get_constant_val() const +// implementations for isl::ast_expr_op_le +ast_expr_op_le::ast_expr_op_le() + : ast_expr_op() {} + +ast_expr_op_le::ast_expr_op_le(const ast_expr_op_le &obj) + : ast_expr_op(obj) { - auto res = isl_aff_get_constant_val(get()); - return 
manage(res); } -isl::val aff::get_denominator_val() const +ast_expr_op_le::ast_expr_op_le(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_le &ast_expr_op_le::operator=(ast_expr_op_le obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_expr_op_le::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_le &obj) { - auto res = isl_aff_get_denominator_val(get()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -std::string aff::get_dim_name(isl::dim type, unsigned int pos) const +// implementations for isl::ast_expr_op_lt +ast_expr_op_lt::ast_expr_op_lt() + : ast_expr_op() {} + +ast_expr_op_lt::ast_expr_op_lt(const ast_expr_op_lt &obj) + : ast_expr_op(obj) { - auto res = isl_aff_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; } -isl::aff aff::get_div(int pos) const +ast_expr_op_lt::ast_expr_op_lt(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_lt &ast_expr_op_lt::operator=(ast_expr_op_lt obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_expr_op_lt::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_lt &obj) { - auto res = isl_aff_get_div(get(), pos); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::local_space aff::get_domain_local_space() const +// implementations for isl::ast_expr_op_max +ast_expr_op_max::ast_expr_op_max() + : ast_expr_op() {} + +ast_expr_op_max::ast_expr_op_max(const ast_expr_op_max &obj) + : ast_expr_op(obj) { - auto res = isl_aff_get_domain_local_space(get()); - return manage(res); } -isl::space 
aff::get_domain_space() const +ast_expr_op_max::ast_expr_op_max(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_max &ast_expr_op_max::operator=(ast_expr_op_max obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_expr_op_max::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); +} + +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_max &obj) { - auto res = isl_aff_get_domain_space(get()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -uint32_t aff::get_hash() const +// implementations for isl::ast_expr_op_member +ast_expr_op_member::ast_expr_op_member() + : ast_expr_op() {} + +ast_expr_op_member::ast_expr_op_member(const ast_expr_op_member &obj) + : ast_expr_op(obj) { - auto res = isl_aff_get_hash(get()); - return res; } -isl::local_space aff::get_local_space() const -{ - auto res = isl_aff_get_local_space(get()); - return manage(res); +ast_expr_op_member::ast_expr_op_member(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_member &ast_expr_op_member::operator=(ast_expr_op_member obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_expr_op_member::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::space aff::get_space() const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_member &obj) { - auto res = isl_aff_get_space(get()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::gist(isl::set context) const -{ - auto res = isl_aff_gist(copy(), context.release()); - return manage(res); -} +// implementations for isl::ast_expr_op_min +ast_expr_op_min::ast_expr_op_min() + : ast_expr_op() {} -isl::aff aff::gist_params(isl::set context) const 
+ast_expr_op_min::ast_expr_op_min(const ast_expr_op_min &obj) + : ast_expr_op(obj) { - auto res = isl_aff_gist_params(copy(), context.release()); - return manage(res); } -isl::basic_set aff::gt_basic_set(isl::aff aff2) const -{ - auto res = isl_aff_gt_basic_set(copy(), aff2.release()); - return manage(res); -} +ast_expr_op_min::ast_expr_op_min(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} -isl::set aff::gt_set(isl::aff aff2) const -{ - auto res = isl_aff_gt_set(copy(), aff2.release()); - return manage(res); +ast_expr_op_min &ast_expr_op_min::operator=(ast_expr_op_min obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::insert_dims(isl::dim type, unsigned int first, unsigned int n) const -{ - auto res = isl_aff_insert_dims(copy(), static_cast(type), first, n); - return manage(res); +isl::ctx ast_expr_op_min::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -boolean aff::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_min &obj) { - auto res = isl_aff_involves_dims(get(), static_cast(type), first, n); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean aff::involves_locals() const +// implementations for isl::ast_expr_op_minus +ast_expr_op_minus::ast_expr_op_minus() + : ast_expr_op() {} + +ast_expr_op_minus::ast_expr_op_minus(const ast_expr_op_minus &obj) + : ast_expr_op(obj) { - auto res = isl_aff_involves_locals(get()); - return manage(res); } -boolean aff::is_cst() const -{ - auto res = isl_aff_is_cst(get()); - return manage(res); +ast_expr_op_minus::ast_expr_op_minus(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_minus &ast_expr_op_minus::operator=(ast_expr_op_minus obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -boolean aff::is_nan() const -{ - auto res = 
isl_aff_is_nan(get()); - return manage(res); +isl::ctx ast_expr_op_minus::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::basic_set aff::le_basic_set(isl::aff aff2) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_minus &obj) { - auto res = isl_aff_le_basic_set(copy(), aff2.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set aff::le_set(isl::aff aff2) const +// implementations for isl::ast_expr_op_mul +ast_expr_op_mul::ast_expr_op_mul() + : ast_expr_op() {} + +ast_expr_op_mul::ast_expr_op_mul(const ast_expr_op_mul &obj) + : ast_expr_op(obj) { - auto res = isl_aff_le_set(copy(), aff2.release()); - return manage(res); } -isl::basic_set aff::lt_basic_set(isl::aff aff2) const -{ - auto res = isl_aff_lt_basic_set(copy(), aff2.release()); - return manage(res); +ast_expr_op_mul::ast_expr_op_mul(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_mul &ast_expr_op_mul::operator=(ast_expr_op_mul obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set aff::lt_set(isl::aff aff2) const -{ - auto res = isl_aff_lt_set(copy(), aff2.release()); - return manage(res); +isl::ctx ast_expr_op_mul::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::mod(isl::val mod) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_mul &obj) { - auto res = isl_aff_mod_val(copy(), mod.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +// implementations for isl::ast_expr_op_or +ast_expr_op_or::ast_expr_op_or() + : ast_expr_op() {} + +ast_expr_op_or::ast_expr_op_or(const 
ast_expr_op_or &obj) + : ast_expr_op(obj) { - auto res = isl_aff_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); } -isl::aff aff::mul(isl::aff aff2) const -{ - auto res = isl_aff_mul(copy(), aff2.release()); - return manage(res); +ast_expr_op_or::ast_expr_op_or(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_or &ast_expr_op_or::operator=(ast_expr_op_or obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::nan_on_domain(isl::local_space ls) -{ - auto res = isl_aff_nan_on_domain(ls.release()); - return manage(res); +isl::ctx ast_expr_op_or::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::nan_on_domain_space(isl::space space) +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_or &obj) { - auto res = isl_aff_nan_on_domain_space(space.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set aff::ne_set(isl::aff aff2) const +// implementations for isl::ast_expr_op_or_else +ast_expr_op_or_else::ast_expr_op_or_else() + : ast_expr_op() {} + +ast_expr_op_or_else::ast_expr_op_or_else(const ast_expr_op_or_else &obj) + : ast_expr_op(obj) { - auto res = isl_aff_ne_set(copy(), aff2.release()); - return manage(res); } -isl::aff aff::neg() const -{ - auto res = isl_aff_neg(copy()); - return manage(res); +ast_expr_op_or_else::ast_expr_op_or_else(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_or_else &ast_expr_op_or_else::operator=(ast_expr_op_or_else obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::basic_set aff::neg_basic_set() const -{ - auto res = isl_aff_neg_basic_set(copy()); - return manage(res); +isl::ctx ast_expr_op_or_else::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::param_on_domain_space_id(isl::space space, isl::id id) 
+inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_or_else &obj) { - auto res = isl_aff_param_on_domain_space_id(space.release(), id.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean aff::plain_is_equal(const isl::aff &aff2) const +// implementations for isl::ast_expr_op_pdiv_q +ast_expr_op_pdiv_q::ast_expr_op_pdiv_q() + : ast_expr_op() {} + +ast_expr_op_pdiv_q::ast_expr_op_pdiv_q(const ast_expr_op_pdiv_q &obj) + : ast_expr_op(obj) { - auto res = isl_aff_plain_is_equal(get(), aff2.get()); - return manage(res); } -boolean aff::plain_is_zero() const -{ - auto res = isl_aff_plain_is_zero(get()); - return manage(res); +ast_expr_op_pdiv_q::ast_expr_op_pdiv_q(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_pdiv_q &ast_expr_op_pdiv_q::operator=(ast_expr_op_pdiv_q obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::project_domain_on_params() const -{ - auto res = isl_aff_project_domain_on_params(copy()); - return manage(res); +isl::ctx ast_expr_op_pdiv_q::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::pullback(isl::multi_aff ma) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_pdiv_q &obj) { - auto res = isl_aff_pullback_multi_aff(copy(), ma.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::pullback_aff(isl::aff aff2) const +// implementations for isl::ast_expr_op_pdiv_r +ast_expr_op_pdiv_r::ast_expr_op_pdiv_r() + : ast_expr_op() {} + +ast_expr_op_pdiv_r::ast_expr_op_pdiv_r(const ast_expr_op_pdiv_r &obj) + : ast_expr_op(obj) { - auto res = isl_aff_pullback_aff(copy(), aff2.release()); - return manage(res); } -isl::aff aff::scale(isl::val v) const -{ - auto res 
= isl_aff_scale_val(copy(), v.release()); - return manage(res); +ast_expr_op_pdiv_r::ast_expr_op_pdiv_r(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_pdiv_r &ast_expr_op_pdiv_r::operator=(ast_expr_op_pdiv_r obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::scale_down(isl::val v) const -{ - auto res = isl_aff_scale_down_val(copy(), v.release()); - return manage(res); +isl::ctx ast_expr_op_pdiv_r::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::scale_down_ui(unsigned int f) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_pdiv_r &obj) { - auto res = isl_aff_scale_down_ui(copy(), f); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::set_coefficient_si(isl::dim type, int pos, int v) const +// implementations for isl::ast_expr_op_select +ast_expr_op_select::ast_expr_op_select() + : ast_expr_op() {} + +ast_expr_op_select::ast_expr_op_select(const ast_expr_op_select &obj) + : ast_expr_op(obj) { - auto res = isl_aff_set_coefficient_si(copy(), static_cast(type), pos, v); - return manage(res); } -isl::aff aff::set_coefficient_val(isl::dim type, int pos, isl::val v) const -{ - auto res = isl_aff_set_coefficient_val(copy(), static_cast(type), pos, v.release()); - return manage(res); +ast_expr_op_select::ast_expr_op_select(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_select &ast_expr_op_select::operator=(ast_expr_op_select obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::set_constant_si(int v) const -{ - auto res = isl_aff_set_constant_si(copy(), v); - return manage(res); +isl::ctx ast_expr_op_select::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::set_constant_val(isl::val v) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_select &obj) 
{ - auto res = isl_aff_set_constant_val(copy(), v.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +// implementations for isl::ast_expr_op_sub +ast_expr_op_sub::ast_expr_op_sub() + : ast_expr_op() {} + +ast_expr_op_sub::ast_expr_op_sub(const ast_expr_op_sub &obj) + : ast_expr_op(obj) { - auto res = isl_aff_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); } -isl::aff aff::set_tuple_id(isl::dim type, isl::id id) const -{ - auto res = isl_aff_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); +ast_expr_op_sub::ast_expr_op_sub(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_sub &ast_expr_op_sub::operator=(ast_expr_op_sub obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::aff aff::sub(isl::aff aff2) const -{ - auto res = isl_aff_sub(copy(), aff2.release()); - return manage(res); +isl::ctx ast_expr_op_sub::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::unbind_params_insert_domain(isl::multi_id domain) const +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_sub &obj) { - auto res = isl_aff_unbind_params_insert_domain(copy(), domain.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff aff::val_on_domain_space(isl::space space, isl::val val) +// implementations for isl::ast_expr_op_zdiv_r +ast_expr_op_zdiv_r::ast_expr_op_zdiv_r() + : ast_expr_op() {} + +ast_expr_op_zdiv_r::ast_expr_op_zdiv_r(const ast_expr_op_zdiv_r &obj) + : ast_expr_op(obj) { - auto res = isl_aff_val_on_domain_space(space.release(), val.release()); - return manage(res); } -isl::aff 
aff::var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos) -{ - auto res = isl_aff_var_on_domain(ls.release(), static_cast(type), pos); - return manage(res); +ast_expr_op_zdiv_r::ast_expr_op_zdiv_r(__isl_take isl_ast_expr *ptr) + : ast_expr_op(ptr) {} + +ast_expr_op_zdiv_r &ast_expr_op_zdiv_r::operator=(ast_expr_op_zdiv_r obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::basic_set aff::zero_basic_set() const -{ - auto res = isl_aff_zero_basic_set(copy()); - return manage(res); +isl::ctx ast_expr_op_zdiv_r::ctx() const { + return isl::ctx(isl_ast_expr_get_ctx(ptr)); } -isl::aff aff::zero_on_domain(isl::space space) +inline std::ostream &operator<<(std::ostream &os, const ast_expr_op_zdiv_r &obj) { - auto res = isl_aff_zero_on_domain_space(space.release()); - return manage(res); + char *str = isl_ast_expr_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::aff_list -aff_list manage(__isl_take isl_aff_list *ptr) { - return aff_list(ptr); +// implementations for isl::ast_node +ast_node manage(__isl_take isl_ast_node *ptr) { + return ast_node(ptr); } -aff_list manage_copy(__isl_keep isl_aff_list *ptr) { - ptr = isl_aff_list_copy(ptr); - return aff_list(ptr); +ast_node manage_copy(__isl_keep isl_ast_node *ptr) { + ptr = isl_ast_node_copy(ptr); + return ast_node(ptr); } -aff_list::aff_list() +ast_node::ast_node() : ptr(nullptr) {} -aff_list::aff_list(const aff_list &obj) +ast_node::ast_node(const ast_node &obj) : ptr(nullptr) { ptr = obj.copy(); } - -aff_list::aff_list(__isl_take isl_aff_list *ptr) +ast_node::ast_node(__isl_take isl_ast_node *ptr) : ptr(ptr) {} - -aff_list &aff_list::operator=(aff_list obj) { +ast_node &ast_node::operator=(ast_node obj) { std::swap(this->ptr, obj.ptr); return *this; } -aff_list::~aff_list() { +ast_node::~ast_node() { if (ptr) - isl_aff_list_free(ptr); + isl_ast_node_free(ptr); } -__isl_give isl_aff_list 
*aff_list::copy() const & { - return isl_aff_list_copy(ptr); +__isl_give isl_ast_node *ast_node::copy() const & { + return isl_ast_node_copy(ptr); } -__isl_keep isl_aff_list *aff_list::get() const { +__isl_keep isl_ast_node *ast_node::get() const { return ptr; } -__isl_give isl_aff_list *aff_list::release() { - isl_aff_list *tmp = ptr; +__isl_give isl_ast_node *ast_node::release() { + isl_ast_node *tmp = ptr; ptr = nullptr; return tmp; } -bool aff_list::is_null() const { +bool ast_node::is_null() const { return ptr == nullptr; } - -isl::ctx aff_list::ctx() const { - return isl::ctx(isl_aff_list_get_ctx(ptr)); -} - -void aff_list::dump() const { - isl_aff_list_dump(get()); -} - - -isl::aff_list aff_list::add(isl::aff el) const -{ - auto res = isl_aff_list_add(copy(), el.release()); - return manage(res); -} - -isl::aff_list aff_list::alloc(isl::ctx ctx, int n) -{ - auto res = isl_aff_list_alloc(ctx.release(), n); - return manage(res); -} - -isl::aff_list aff_list::clear() const -{ - auto res = isl_aff_list_clear(copy()); - return manage(res); -} - -isl::aff_list aff_list::concat(isl::aff_list list2) const -{ - auto res = isl_aff_list_concat(copy(), list2.release()); - return manage(res); -} - -isl::aff_list aff_list::drop(unsigned int first, unsigned int n) const -{ - auto res = isl_aff_list_drop(copy(), first, n); - return manage(res); -} - -stat aff_list::foreach(const std::function &fn) const +template +boolean ast_node::isa_type(T subtype) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_aff *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_aff_list_foreach(get(), fn_lambda, &fn_data); - return manage(res); + if (is_null()) + return boolean(); + return isl_ast_node_get_type(get()) == subtype; } - -isl::aff_list aff_list::from_aff(isl::aff el) +template +boolean ast_node::isa() const { - auto res = 
isl_aff_list_from_aff(el.release()); - return manage(res); + return isa_type(T::type); } - -isl::aff aff_list::get_aff(int index) const +template +T ast_node::as() const { - auto res = isl_aff_list_get_aff(get(), index); - return manage(res); + if (isa().is_false()) + isl_die(ctx().get(), isl_error_invalid, "not an object of the requested subtype", return T()); + return T(copy()); } -isl::aff aff_list::get_at(int index) const -{ - auto res = isl_aff_list_get_at(get(), index); - return manage(res); +isl::ctx ast_node::ctx() const { + return isl::ctx(isl_ast_node_get_ctx(ptr)); } -isl::aff_list aff_list::insert(unsigned int pos, isl::aff el) const +isl::id ast_node::annotation() const { - auto res = isl_aff_list_insert(copy(), pos, el.release()); + auto res = isl_ast_node_get_annotation(get()); return manage(res); } -isl_size aff_list::n_aff() const +isl::id ast_node::get_annotation() const { - auto res = isl_aff_list_n_aff(get()); - return res; + return annotation(); } -isl::aff_list aff_list::reverse() const +std::string ast_node::to_C_str() const { - auto res = isl_aff_list_reverse(copy()); - return manage(res); + auto res = isl_ast_node_to_C_str(get()); + std::string tmp(res); + free(res); + return tmp; } -isl::aff_list aff_list::set_aff(int index, isl::aff el) const +isl::ast_node_list ast_node::to_list() const { - auto res = isl_aff_list_set_aff(copy(), index, el.release()); + auto res = isl_ast_node_to_list(copy()); return manage(res); } -isl_size aff_list::size() const +inline std::ostream &operator<<(std::ostream &os, const ast_node &obj) { - auto res = isl_aff_list_size(get()); - return res; + char *str = isl_ast_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::aff_list aff_list::swap(unsigned int pos1, unsigned int pos2) const +// implementations for isl::ast_node_block +ast_node_block::ast_node_block() + : ast_node() {} + +ast_node_block::ast_node_block(const 
ast_node_block &obj) + : ast_node(obj) { - auto res = isl_aff_list_swap(copy(), pos1, pos2); - return manage(res); } -// implementations for isl::ast_build -ast_build manage(__isl_take isl_ast_build *ptr) { - return ast_build(ptr); +ast_node_block::ast_node_block(__isl_take isl_ast_node *ptr) + : ast_node(ptr) {} + +ast_node_block &ast_node_block::operator=(ast_node_block obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -ast_build manage_copy(__isl_keep isl_ast_build *ptr) { - ptr = isl_ast_build_copy(ptr); - return ast_build(ptr); + +isl::ctx ast_node_block::ctx() const { + return isl::ctx(isl_ast_node_get_ctx(ptr)); } -ast_build::ast_build() - : ptr(nullptr) {} +isl::ast_node_list ast_node_block::children() const +{ + auto res = isl_ast_node_block_get_children(get()); + return manage(res); +} -ast_build::ast_build(const ast_build &obj) - : ptr(nullptr) +isl::ast_node_list ast_node_block::get_children() const { - ptr = obj.copy(); + return children(); } +inline std::ostream &operator<<(std::ostream &os, const ast_node_block &obj) +{ + char *str = isl_ast_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} -ast_build::ast_build(__isl_take isl_ast_build *ptr) - : ptr(ptr) {} +// implementations for isl::ast_node_for +ast_node_for::ast_node_for() + : ast_node() {} -ast_build::ast_build(isl::ctx ctx) +ast_node_for::ast_node_for(const ast_node_for &obj) + : ast_node(obj) { - auto res = isl_ast_build_alloc(ctx.release()); - ptr = res; } -ast_build &ast_build::operator=(ast_build obj) { +ast_node_for::ast_node_for(__isl_take isl_ast_node *ptr) + : ast_node(ptr) {} + +ast_node_for &ast_node_for::operator=(ast_node_for obj) { std::swap(this->ptr, obj.ptr); return *this; } -ast_build::~ast_build() { - if (ptr) - isl_ast_build_free(ptr); +isl::ctx ast_node_for::ctx() const { + return isl::ctx(isl_ast_node_get_ctx(ptr)); } -__isl_give isl_ast_build *ast_build::copy() const & { - return 
isl_ast_build_copy(ptr); +isl::ast_node ast_node_for::body() const +{ + auto res = isl_ast_node_for_get_body(get()); + return manage(res); } -__isl_keep isl_ast_build *ast_build::get() const { - return ptr; +isl::ast_node ast_node_for::get_body() const +{ + return body(); } -__isl_give isl_ast_build *ast_build::release() { - isl_ast_build *tmp = ptr; - ptr = nullptr; - return tmp; +isl::ast_expr ast_node_for::cond() const +{ + auto res = isl_ast_node_for_get_cond(get()); + return manage(res); } -bool ast_build::is_null() const { - return ptr == nullptr; +isl::ast_expr ast_node_for::get_cond() const +{ + return cond(); } - -isl::ctx ast_build::ctx() const { - return isl::ctx(isl_ast_build_get_ctx(ptr)); +isl::ast_expr ast_node_for::inc() const +{ + auto res = isl_ast_node_for_get_inc(get()); + return manage(res); } +isl::ast_expr ast_node_for::get_inc() const +{ + return inc(); +} -isl::ast_expr ast_build::access_from(isl::multi_pw_aff mpa) const +isl::ast_expr ast_node_for::init() const { - auto res = isl_ast_build_access_from_multi_pw_aff(get(), mpa.release()); + auto res = isl_ast_node_for_get_init(get()); return manage(res); } -isl::ast_expr ast_build::access_from(isl::pw_multi_aff pma) const +isl::ast_expr ast_node_for::get_init() const { - auto res = isl_ast_build_access_from_pw_multi_aff(get(), pma.release()); - return manage(res); + return init(); } -isl::ast_node ast_build::ast_from_schedule(isl::union_map schedule) const +boolean ast_node_for::is_degenerate() const { - auto res = isl_ast_build_ast_from_schedule(get(), schedule.release()); + auto res = isl_ast_node_for_is_degenerate(get()); return manage(res); } -isl::ast_expr ast_build::call_from(isl::multi_pw_aff mpa) const +isl::ast_expr ast_node_for::iterator() const { - auto res = isl_ast_build_call_from_multi_pw_aff(get(), mpa.release()); + auto res = isl_ast_node_for_get_iterator(get()); return manage(res); } -isl::ast_expr ast_build::call_from(isl::pw_multi_aff pma) const +isl::ast_expr 
ast_node_for::get_iterator() const { - auto res = isl_ast_build_call_from_pw_multi_aff(get(), pma.release()); - return manage(res); + return iterator(); } -isl::ast_expr ast_build::expr_from(isl::pw_aff pa) const +inline std::ostream &operator<<(std::ostream &os, const ast_node_for &obj) { - auto res = isl_ast_build_expr_from_pw_aff(get(), pa.release()); - return manage(res); + char *str = isl_ast_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::ast_expr ast_build::expr_from(isl::set set) const +// implementations for isl::ast_node_if +ast_node_if::ast_node_if() + : ast_node() {} + +ast_node_if::ast_node_if(const ast_node_if &obj) + : ast_node(obj) { - auto res = isl_ast_build_expr_from_set(get(), set.release()); - return manage(res); } -isl::ast_build ast_build::from_context(isl::set set) +ast_node_if::ast_node_if(__isl_take isl_ast_node *ptr) + : ast_node(ptr) {} + +ast_node_if &ast_node_if::operator=(ast_node_if obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_node_if::ctx() const { + return isl::ctx(isl_ast_node_get_ctx(ptr)); +} + +isl::ast_expr ast_node_if::cond() const { - auto res = isl_ast_build_from_context(set.release()); + auto res = isl_ast_node_if_get_cond(get()); return manage(res); } -isl::union_map ast_build::get_schedule() const +isl::ast_expr ast_node_if::get_cond() const { - auto res = isl_ast_build_get_schedule(get()); - return manage(res); + return cond(); } -isl::space ast_build::get_schedule_space() const +isl::ast_node ast_node_if::else_node() const { - auto res = isl_ast_build_get_schedule_space(get()); + auto res = isl_ast_node_if_get_else_node(get()); return manage(res); } -isl::ast_node ast_build::node_from(isl::schedule schedule) const +isl::ast_node ast_node_if::get_else_node() const { - auto res = isl_ast_build_node_from_schedule(get(), schedule.release()); - return manage(res); + return else_node(); } -isl::ast_node 
ast_build::node_from_schedule_map(isl::union_map schedule) const +boolean ast_node_if::has_else_node() const { - auto res = isl_ast_build_node_from_schedule_map(get(), schedule.release()); + auto res = isl_ast_node_if_has_else_node(get()); return manage(res); } -isl::ast_build ast_build::restrict(isl::set set) const +isl::ast_node ast_node_if::then_node() const { - auto res = isl_ast_build_restrict(copy(), set.release()); + auto res = isl_ast_node_if_get_then_node(get()); return manage(res); } -// implementations for isl::ast_expr -ast_expr manage(__isl_take isl_ast_expr *ptr) { - return ast_expr(ptr); +isl::ast_node ast_node_if::get_then_node() const +{ + return then_node(); } -ast_expr manage_copy(__isl_keep isl_ast_expr *ptr) { - ptr = isl_ast_expr_copy(ptr); - return ast_expr(ptr); + +inline std::ostream &operator<<(std::ostream &os, const ast_node_if &obj) +{ + char *str = isl_ast_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -ast_expr::ast_expr() +// implementations for isl::ast_node_list +ast_node_list manage(__isl_take isl_ast_node_list *ptr) { + return ast_node_list(ptr); +} +ast_node_list manage_copy(__isl_keep isl_ast_node_list *ptr) { + ptr = isl_ast_node_list_copy(ptr); + return ast_node_list(ptr); +} + +ast_node_list::ast_node_list() : ptr(nullptr) {} -ast_expr::ast_expr(const ast_expr &obj) +ast_node_list::ast_node_list(const ast_node_list &obj) : ptr(nullptr) { ptr = obj.copy(); } - -ast_expr::ast_expr(__isl_take isl_ast_expr *ptr) +ast_node_list::ast_node_list(__isl_take isl_ast_node_list *ptr) : ptr(ptr) {} +ast_node_list::ast_node_list(isl::ctx ctx, int n) +{ + auto res = isl_ast_node_list_alloc(ctx.release(), n); + ptr = res; +} + +ast_node_list::ast_node_list(isl::ast_node el) +{ + auto res = isl_ast_node_list_from_ast_node(el.release()); + ptr = res; +} -ast_expr &ast_expr::operator=(ast_expr obj) { +ast_node_list &ast_node_list::operator=(ast_node_list 
obj) { std::swap(this->ptr, obj.ptr); return *this; } -ast_expr::~ast_expr() { +ast_node_list::~ast_node_list() { if (ptr) - isl_ast_expr_free(ptr); + isl_ast_node_list_free(ptr); } -__isl_give isl_ast_expr *ast_expr::copy() const & { - return isl_ast_expr_copy(ptr); +__isl_give isl_ast_node_list *ast_node_list::copy() const & { + return isl_ast_node_list_copy(ptr); } -__isl_keep isl_ast_expr *ast_expr::get() const { +__isl_keep isl_ast_node_list *ast_node_list::get() const { return ptr; } -__isl_give isl_ast_expr *ast_expr::release() { - isl_ast_expr *tmp = ptr; +__isl_give isl_ast_node_list *ast_node_list::release() { + isl_ast_node_list *tmp = ptr; ptr = nullptr; return tmp; } -bool ast_expr::is_null() const { +bool ast_node_list::is_null() const { return ptr == nullptr; } - -isl::ctx ast_expr::ctx() const { - return isl::ctx(isl_ast_expr_get_ctx(ptr)); -} - -void ast_expr::dump() const { - isl_ast_expr_dump(get()); -} - - -isl::ast_expr ast_expr::access(isl::ast_expr_list indices) const -{ - auto res = isl_ast_expr_access(copy(), indices.release()); - return manage(res); -} - -isl::ast_expr ast_expr::add(isl::ast_expr expr2) const -{ - auto res = isl_ast_expr_add(copy(), expr2.release()); - return manage(res); -} - -isl::ast_expr ast_expr::address_of() const -{ - auto res = isl_ast_expr_address_of(copy()); - return manage(res); +isl::ctx ast_node_list::ctx() const { + return isl::ctx(isl_ast_node_list_get_ctx(ptr)); } -isl::ast_expr ast_expr::call(isl::ast_expr_list arguments) const +isl::ast_node_list ast_node_list::add(isl::ast_node el) const { - auto res = isl_ast_expr_call(copy(), arguments.release()); + auto res = isl_ast_node_list_add(copy(), el.release()); return manage(res); } -isl::ast_expr ast_expr::div(isl::ast_expr expr2) const +isl::ast_node ast_node_list::at(int index) const { - auto res = isl_ast_expr_div(copy(), expr2.release()); + auto res = isl_ast_node_list_get_at(get(), index); return manage(res); } -isl::ast_expr ast_expr::eq(isl::ast_expr 
expr2) const +isl::ast_node ast_node_list::get_at(int index) const { - auto res = isl_ast_expr_eq(copy(), expr2.release()); - return manage(res); + return at(index); } -isl::ast_expr ast_expr::from_id(isl::id id) +isl::ast_node_list ast_node_list::clear() const { - auto res = isl_ast_expr_from_id(id.release()); + auto res = isl_ast_node_list_clear(copy()); return manage(res); } -isl::ast_expr ast_expr::from_val(isl::val v) +isl::ast_node_list ast_node_list::concat(isl::ast_node_list list2) const { - auto res = isl_ast_expr_from_val(v.release()); + auto res = isl_ast_node_list_concat(copy(), list2.release()); return manage(res); } -isl::ast_expr ast_expr::ge(isl::ast_expr expr2) const +isl::ast_node_list ast_node_list::drop(unsigned int first, unsigned int n) const { - auto res = isl_ast_expr_ge(copy(), expr2.release()); + auto res = isl_ast_node_list_drop(copy(), first, n); return manage(res); } -isl::id ast_expr::get_id() const +stat ast_node_list::foreach(const std::function &fn) const { - auto res = isl_ast_expr_get_id(get()); + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_ast_node *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_ast_node_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::ast_expr ast_expr::get_op_arg(int pos) const +isl::ast_node_list ast_node_list::insert(unsigned int pos, isl::ast_node el) const { - auto res = isl_ast_expr_get_op_arg(get(), pos); + auto res = isl_ast_node_list_insert(copy(), pos, el.release()); return manage(res); } -isl_size ast_expr::get_op_n_arg() const -{ - auto res = isl_ast_expr_get_op_n_arg(get()); - return res; -} - -isl::val ast_expr::get_val() const +class size ast_node_list::size() const { - auto res = isl_ast_expr_get_val(get()); + auto res = isl_ast_node_list_size(get()); return manage(res); } -isl::ast_expr ast_expr::gt(isl::ast_expr expr2) 
const +inline std::ostream &operator<<(std::ostream &os, const ast_node_list &obj) { - auto res = isl_ast_expr_gt(copy(), expr2.release()); - return manage(res); + char *str = isl_ast_node_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::id ast_expr::id_get_id() const -{ - auto res = isl_ast_expr_id_get_id(get()); - return manage(res); -} +// implementations for isl::ast_node_mark +ast_node_mark::ast_node_mark() + : ast_node() {} -isl::val ast_expr::int_get_val() const +ast_node_mark::ast_node_mark(const ast_node_mark &obj) + : ast_node(obj) { - auto res = isl_ast_expr_int_get_val(get()); - return manage(res); } -boolean ast_expr::is_equal(const isl::ast_expr &expr2) const -{ - auto res = isl_ast_expr_is_equal(get(), expr2.get()); - return manage(res); -} +ast_node_mark::ast_node_mark(__isl_take isl_ast_node *ptr) + : ast_node(ptr) {} -isl::ast_expr ast_expr::le(isl::ast_expr expr2) const -{ - auto res = isl_ast_expr_le(copy(), expr2.release()); - return manage(res); +ast_node_mark &ast_node_mark::operator=(ast_node_mark obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::ast_expr ast_expr::lt(isl::ast_expr expr2) const -{ - auto res = isl_ast_expr_lt(copy(), expr2.release()); - return manage(res); +isl::ctx ast_node_mark::ctx() const { + return isl::ctx(isl_ast_node_get_ctx(ptr)); } -isl::ast_expr ast_expr::mul(isl::ast_expr expr2) const +isl::id ast_node_mark::id() const { - auto res = isl_ast_expr_mul(copy(), expr2.release()); + auto res = isl_ast_node_mark_get_id(get()); return manage(res); } -isl::ast_expr ast_expr::neg() const +isl::id ast_node_mark::get_id() const { - auto res = isl_ast_expr_neg(copy()); - return manage(res); + return id(); } -isl::ast_expr ast_expr::op_get_arg(int pos) const +isl::ast_node ast_node_mark::node() const { - auto res = isl_ast_expr_op_get_arg(get(), pos); + auto res = isl_ast_node_mark_get_node(get()); return manage(res); } 
-isl_size ast_expr::op_get_n_arg() const +isl::ast_node ast_node_mark::get_node() const { - auto res = isl_ast_expr_op_get_n_arg(get()); - return res; + return node(); } -isl::ast_expr ast_expr::pdiv_q(isl::ast_expr expr2) const +inline std::ostream &operator<<(std::ostream &os, const ast_node_mark &obj) { - auto res = isl_ast_expr_pdiv_q(copy(), expr2.release()); - return manage(res); + char *str = isl_ast_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::ast_expr ast_expr::pdiv_r(isl::ast_expr expr2) const +// implementations for isl::ast_node_user +ast_node_user::ast_node_user() + : ast_node() {} + +ast_node_user::ast_node_user(const ast_node_user &obj) + : ast_node(obj) { - auto res = isl_ast_expr_pdiv_r(copy(), expr2.release()); - return manage(res); } -isl::ast_expr ast_expr::set_op_arg(int pos, isl::ast_expr arg) const -{ - auto res = isl_ast_expr_set_op_arg(copy(), pos, arg.release()); - return manage(res); +ast_node_user::ast_node_user(__isl_take isl_ast_node *ptr) + : ast_node(ptr) {} + +ast_node_user &ast_node_user::operator=(ast_node_user obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx ast_node_user::ctx() const { + return isl::ctx(isl_ast_node_get_ctx(ptr)); } -isl::ast_expr ast_expr::sub(isl::ast_expr expr2) const +isl::ast_expr ast_node_user::expr() const { - auto res = isl_ast_expr_sub(copy(), expr2.release()); + auto res = isl_ast_node_user_get_expr(get()); return manage(res); } -isl::ast_expr ast_expr::substitute_ids(isl::id_to_ast_expr id2expr) const +isl::ast_expr ast_node_user::get_expr() const { - auto res = isl_ast_expr_substitute_ids(copy(), id2expr.release()); - return manage(res); + return expr(); } -std::string ast_expr::to_C_str() const +inline std::ostream &operator<<(std::ostream &os, const ast_node_user &obj) { - auto res = isl_ast_expr_to_C_str(get()); - std::string tmp(res); - free(res); - return tmp; + char *str = 
isl_ast_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::ast_expr_list -ast_expr_list manage(__isl_take isl_ast_expr_list *ptr) { - return ast_expr_list(ptr); +// implementations for isl::basic_map +basic_map manage(__isl_take isl_basic_map *ptr) { + return basic_map(ptr); } -ast_expr_list manage_copy(__isl_keep isl_ast_expr_list *ptr) { - ptr = isl_ast_expr_list_copy(ptr); - return ast_expr_list(ptr); +basic_map manage_copy(__isl_keep isl_basic_map *ptr) { + ptr = isl_basic_map_copy(ptr); + return basic_map(ptr); } -ast_expr_list::ast_expr_list() +basic_map::basic_map() : ptr(nullptr) {} -ast_expr_list::ast_expr_list(const ast_expr_list &obj) +basic_map::basic_map(const basic_map &obj) : ptr(nullptr) { ptr = obj.copy(); } - -ast_expr_list::ast_expr_list(__isl_take isl_ast_expr_list *ptr) +basic_map::basic_map(__isl_take isl_basic_map *ptr) : ptr(ptr) {} +basic_map::basic_map(isl::ctx ctx, const std::string &str) +{ + auto res = isl_basic_map_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} -ast_expr_list &ast_expr_list::operator=(ast_expr_list obj) { +basic_map &basic_map::operator=(basic_map obj) { std::swap(this->ptr, obj.ptr); return *this; } -ast_expr_list::~ast_expr_list() { +basic_map::~basic_map() { if (ptr) - isl_ast_expr_list_free(ptr); + isl_basic_map_free(ptr); } -__isl_give isl_ast_expr_list *ast_expr_list::copy() const & { - return isl_ast_expr_list_copy(ptr); +__isl_give isl_basic_map *basic_map::copy() const & { + return isl_basic_map_copy(ptr); } -__isl_keep isl_ast_expr_list *ast_expr_list::get() const { +__isl_keep isl_basic_map *basic_map::get() const { return ptr; } -__isl_give isl_ast_expr_list *ast_expr_list::release() { - isl_ast_expr_list *tmp = ptr; +__isl_give isl_basic_map *basic_map::release() { + isl_basic_map *tmp = ptr; ptr = nullptr; return tmp; } -bool ast_expr_list::is_null() const { +bool 
basic_map::is_null() const { return ptr == nullptr; } - -isl::ctx ast_expr_list::ctx() const { - return isl::ctx(isl_ast_expr_list_get_ctx(ptr)); -} - -void ast_expr_list::dump() const { - isl_ast_expr_list_dump(get()); +isl::ctx basic_map::ctx() const { + return isl::ctx(isl_basic_map_get_ctx(ptr)); } - -isl::ast_expr_list ast_expr_list::add(isl::ast_expr el) const +isl::map basic_map::add_constraint(const isl::constraint &constraint) const { - auto res = isl_ast_expr_list_add(copy(), el.release()); - return manage(res); + return isl::map(*this).add_constraint(constraint); } -isl::ast_expr_list ast_expr_list::alloc(isl::ctx ctx, int n) +isl::map basic_map::add_dims(isl::dim type, unsigned int n) const { - auto res = isl_ast_expr_list_alloc(ctx.release(), n); - return manage(res); + return isl::map(*this).add_dims(type, n); } -isl::ast_expr_list ast_expr_list::clear() const +isl::basic_map basic_map::affine_hull() const { - auto res = isl_ast_expr_list_clear(copy()); + auto res = isl_basic_map_affine_hull(copy()); return manage(res); } -isl::ast_expr_list ast_expr_list::concat(isl::ast_expr_list list2) const +isl::map basic_map::align_params(const isl::space &model) const { - auto res = isl_ast_expr_list_concat(copy(), list2.release()); - return manage(res); + return isl::map(*this).align_params(model); } -isl::ast_expr_list ast_expr_list::drop(unsigned int first, unsigned int n) const +isl::basic_map basic_map::apply_domain(isl::basic_map bmap2) const { - auto res = isl_ast_expr_list_drop(copy(), first, n); + auto res = isl_basic_map_apply_domain(copy(), bmap2.release()); return manage(res); } -stat ast_expr_list::foreach(const std::function &fn) const +isl::map basic_map::apply_domain(const isl::map &map2) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_ast_expr *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - 
auto res = isl_ast_expr_list_foreach(get(), fn_lambda, &fn_data); - return manage(res); + return isl::map(*this).apply_domain(map2); } -isl::ast_expr_list ast_expr_list::from_ast_expr(isl::ast_expr el) +isl::union_map basic_map::apply_domain(const isl::union_map &umap2) const { - auto res = isl_ast_expr_list_from_ast_expr(el.release()); - return manage(res); + return isl::map(*this).apply_domain(umap2); } -isl::ast_expr ast_expr_list::get_ast_expr(int index) const +isl::basic_map basic_map::apply_range(isl::basic_map bmap2) const { - auto res = isl_ast_expr_list_get_ast_expr(get(), index); + auto res = isl_basic_map_apply_range(copy(), bmap2.release()); return manage(res); } -isl::ast_expr ast_expr_list::get_at(int index) const +isl::map basic_map::apply_range(const isl::map &map2) const { - auto res = isl_ast_expr_list_get_at(get(), index); - return manage(res); + return isl::map(*this).apply_range(map2); } -isl::ast_expr_list ast_expr_list::insert(unsigned int pos, isl::ast_expr el) const +isl::union_map basic_map::apply_range(const isl::union_map &umap2) const { - auto res = isl_ast_expr_list_insert(copy(), pos, el.release()); - return manage(res); + return isl::map(*this).apply_range(umap2); } -isl_size ast_expr_list::n_ast_expr() const +isl::map basic_map::as_map() const { - auto res = isl_ast_expr_list_n_ast_expr(get()); - return res; + return isl::map(*this).as_map(); } -isl::ast_expr_list ast_expr_list::reverse() const +isl::multi_union_pw_aff basic_map::as_multi_union_pw_aff() const { - auto res = isl_ast_expr_list_reverse(copy()); - return manage(res); + return isl::map(*this).as_multi_union_pw_aff(); } -isl::ast_expr_list ast_expr_list::set_ast_expr(int index, isl::ast_expr el) const +isl::pw_multi_aff basic_map::as_pw_multi_aff() const { - auto res = isl_ast_expr_list_set_ast_expr(copy(), index, el.release()); - return manage(res); + return isl::map(*this).as_pw_multi_aff(); } -isl_size ast_expr_list::size() const +isl::union_pw_multi_aff 
basic_map::as_union_pw_multi_aff() const { - auto res = isl_ast_expr_list_size(get()); - return res; + return isl::map(*this).as_union_pw_multi_aff(); } -isl::ast_expr_list ast_expr_list::swap(unsigned int pos1, unsigned int pos2) const +isl::basic_map_list basic_map::basic_map_list() const { - auto res = isl_ast_expr_list_swap(copy(), pos1, pos2); - return manage(res); -} - -// implementations for isl::ast_node -ast_node manage(__isl_take isl_ast_node *ptr) { - return ast_node(ptr); -} -ast_node manage_copy(__isl_keep isl_ast_node *ptr) { - ptr = isl_ast_node_copy(ptr); - return ast_node(ptr); + return isl::map(*this).basic_map_list(); } -ast_node::ast_node() - : ptr(nullptr) {} - -ast_node::ast_node(const ast_node &obj) - : ptr(nullptr) +isl::set basic_map::bind_domain(const isl::multi_id &tuple) const { - ptr = obj.copy(); -} - - -ast_node::ast_node(__isl_take isl_ast_node *ptr) - : ptr(ptr) {} - - -ast_node &ast_node::operator=(ast_node obj) { - std::swap(this->ptr, obj.ptr); - return *this; -} - -ast_node::~ast_node() { - if (ptr) - isl_ast_node_free(ptr); -} - -__isl_give isl_ast_node *ast_node::copy() const & { - return isl_ast_node_copy(ptr); + return isl::map(*this).bind_domain(tuple); } -__isl_keep isl_ast_node *ast_node::get() const { - return ptr; +isl::set basic_map::bind_range(const isl::multi_id &tuple) const +{ + return isl::map(*this).bind_range(tuple); } -__isl_give isl_ast_node *ast_node::release() { - isl_ast_node *tmp = ptr; - ptr = nullptr; - return tmp; +boolean basic_map::can_curry() const +{ + return isl::map(*this).can_curry(); } -bool ast_node::is_null() const { - return ptr == nullptr; +isl::map basic_map::coalesce() const +{ + return isl::map(*this).coalesce(); } - -isl::ctx ast_node::ctx() const { - return isl::ctx(isl_ast_node_get_ctx(ptr)); +isl::map basic_map::complement() const +{ + return isl::map(*this).complement(); } -void ast_node::dump() const { - isl_ast_node_dump(get()); +isl::union_map basic_map::compute_divs() const +{ + 
return isl::map(*this).compute_divs(); } - -isl::ast_node ast_node::alloc_user(isl::ast_expr expr) +isl::map basic_map::curry() const { - auto res = isl_ast_node_alloc_user(expr.release()); - return manage(res); + return isl::map(*this).curry(); } -isl::ast_node_list ast_node::block_get_children() const +isl::basic_set basic_map::deltas() const { - auto res = isl_ast_node_block_get_children(get()); + auto res = isl_basic_map_deltas(copy()); return manage(res); } -isl::ast_node ast_node::for_get_body() const +isl::basic_map basic_map::detect_equalities() const { - auto res = isl_ast_node_for_get_body(get()); + auto res = isl_basic_map_detect_equalities(copy()); return manage(res); } -isl::ast_expr ast_node::for_get_cond() const +class size basic_map::dim(isl::dim type) const { - auto res = isl_ast_node_for_get_cond(get()); - return manage(res); + return isl::map(*this).dim(type); } -isl::ast_expr ast_node::for_get_inc() const +isl::pw_aff basic_map::dim_max(int pos) const { - auto res = isl_ast_node_for_get_inc(get()); - return manage(res); + return isl::map(*this).dim_max(pos); } -isl::ast_expr ast_node::for_get_init() const +isl::pw_aff basic_map::dim_min(int pos) const { - auto res = isl_ast_node_for_get_init(get()); - return manage(res); + return isl::map(*this).dim_min(pos); } -isl::ast_expr ast_node::for_get_iterator() const +isl::basic_set basic_map::domain() const { - auto res = isl_ast_node_for_get_iterator(get()); + auto res = isl_basic_map_domain(copy()); return manage(res); } -boolean ast_node::for_is_degenerate() const +isl::map basic_map::domain_factor_domain() const { - auto res = isl_ast_node_for_is_degenerate(get()); - return manage(res); + return isl::map(*this).domain_factor_domain(); } -isl::id ast_node::get_annotation() const +isl::map basic_map::domain_factor_range() const { - auto res = isl_ast_node_get_annotation(get()); - return manage(res); + return isl::map(*this).domain_factor_range(); } -isl::ast_expr ast_node::if_get_cond() const 
+isl::map basic_map::domain_map() const { - auto res = isl_ast_node_if_get_cond(get()); - return manage(res); + return isl::map(*this).domain_map(); } -isl::ast_node ast_node::if_get_else() const +isl::union_pw_multi_aff basic_map::domain_map_union_pw_multi_aff() const { - auto res = isl_ast_node_if_get_else(get()); - return manage(res); + return isl::map(*this).domain_map_union_pw_multi_aff(); } -isl::ast_node ast_node::if_get_else_node() const +isl::map basic_map::domain_product(const isl::map &map2) const { - auto res = isl_ast_node_if_get_else_node(get()); - return manage(res); + return isl::map(*this).domain_product(map2); } -isl::ast_node ast_node::if_get_then() const +isl::union_map basic_map::domain_product(const isl::union_map &umap2) const { - auto res = isl_ast_node_if_get_then(get()); - return manage(res); + return isl::map(*this).domain_product(umap2); } -isl::ast_node ast_node::if_get_then_node() const +class size basic_map::domain_tuple_dim() const { - auto res = isl_ast_node_if_get_then_node(get()); - return manage(res); + return isl::map(*this).domain_tuple_dim(); } -boolean ast_node::if_has_else() const +isl::id basic_map::domain_tuple_id() const { - auto res = isl_ast_node_if_has_else(get()); - return manage(res); + return isl::map(*this).domain_tuple_id(); } -boolean ast_node::if_has_else_node() const +isl::map basic_map::eq_at(const isl::multi_pw_aff &mpa) const { - auto res = isl_ast_node_if_has_else_node(get()); - return manage(res); + return isl::map(*this).eq_at(mpa); } -isl::id ast_node::mark_get_id() const +isl::union_map basic_map::eq_at(const isl::multi_union_pw_aff &mupa) const { - auto res = isl_ast_node_mark_get_id(get()); - return manage(res); + return isl::map(*this).eq_at(mupa); } -isl::ast_node ast_node::mark_get_node() const +isl::basic_map basic_map::equal(isl::space space, unsigned int n_equal) { - auto res = isl_ast_node_mark_get_node(get()); + auto res = isl_basic_map_equal(space.release(), n_equal); return manage(res); } 
-isl::ast_node ast_node::set_annotation(isl::id annotation) const +isl::basic_map basic_map::equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const { - auto res = isl_ast_node_set_annotation(copy(), annotation.release()); + auto res = isl_basic_map_equate(copy(), static_cast(type1), pos1, static_cast(type2), pos2); return manage(res); } -std::string ast_node::to_C_str() const +boolean basic_map::every_map(const std::function &test) const { - auto res = isl_ast_node_to_C_str(get()); - std::string tmp(res); - free(res); - return tmp; + return isl::map(*this).every_map(test); } -isl::ast_expr ast_node::user_get_expr() const +isl::map basic_map::extract_map(const isl::space &space) const { - auto res = isl_ast_node_user_get_expr(get()); - return manage(res); -} - -// implementations for isl::ast_node_list -ast_node_list manage(__isl_take isl_ast_node_list *ptr) { - return ast_node_list(ptr); -} -ast_node_list manage_copy(__isl_keep isl_ast_node_list *ptr) { - ptr = isl_ast_node_list_copy(ptr); - return ast_node_list(ptr); + return isl::map(*this).extract_map(space); } -ast_node_list::ast_node_list() - : ptr(nullptr) {} - -ast_node_list::ast_node_list(const ast_node_list &obj) - : ptr(nullptr) +isl::map basic_map::factor_domain() const { - ptr = obj.copy(); -} - - -ast_node_list::ast_node_list(__isl_take isl_ast_node_list *ptr) - : ptr(ptr) {} - - -ast_node_list &ast_node_list::operator=(ast_node_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; -} - -ast_node_list::~ast_node_list() { - if (ptr) - isl_ast_node_list_free(ptr); -} - -__isl_give isl_ast_node_list *ast_node_list::copy() const & { - return isl_ast_node_list_copy(ptr); + return isl::map(*this).factor_domain(); } -__isl_keep isl_ast_node_list *ast_node_list::get() const { - return ptr; +isl::map basic_map::factor_range() const +{ + return isl::map(*this).factor_range(); } -__isl_give isl_ast_node_list *ast_node_list::release() { - isl_ast_node_list *tmp = ptr; - ptr = nullptr; - return 
tmp; +isl::basic_map basic_map::fix_si(isl::dim type, unsigned int pos, int value) const +{ + auto res = isl_basic_map_fix_si(copy(), static_cast(type), pos, value); + return manage(res); } -bool ast_node_list::is_null() const { - return ptr == nullptr; +isl::basic_map basic_map::fix_val(isl::dim type, unsigned int pos, isl::val v) const +{ + auto res = isl_basic_map_fix_val(copy(), static_cast(type), pos, v.release()); + return manage(res); } - -isl::ctx ast_node_list::ctx() const { - return isl::ctx(isl_ast_node_list_get_ctx(ptr)); +isl::basic_map basic_map::fix_val(isl::dim type, unsigned int pos, long v) const +{ + return this->fix_val(type, pos, isl::val(ctx(), v)); } -void ast_node_list::dump() const { - isl_ast_node_list_dump(get()); +isl::union_map basic_map::fixed_power(const isl::val &exp) const +{ + return isl::map(*this).fixed_power(exp); } - -isl::ast_node_list ast_node_list::add(isl::ast_node el) const +isl::union_map basic_map::fixed_power(long exp) const { - auto res = isl_ast_node_list_add(copy(), el.release()); - return manage(res); + return this->fixed_power(isl::val(ctx(), exp)); } -isl::ast_node_list ast_node_list::alloc(isl::ctx ctx, int n) +isl::map basic_map::flat_range_product(const isl::map &map2) const { - auto res = isl_ast_node_list_alloc(ctx.release(), n); - return manage(res); + return isl::map(*this).flat_range_product(map2); } -isl::ast_node_list ast_node_list::clear() const +isl::union_map basic_map::flat_range_product(const isl::union_map &umap2) const { - auto res = isl_ast_node_list_clear(copy()); - return manage(res); + return isl::map(*this).flat_range_product(umap2); } -isl::ast_node_list ast_node_list::concat(isl::ast_node_list list2) const +isl::basic_map basic_map::flatten() const { - auto res = isl_ast_node_list_concat(copy(), list2.release()); + auto res = isl_basic_map_flatten(copy()); return manage(res); } -isl::ast_node_list ast_node_list::drop(unsigned int first, unsigned int n) const +isl::basic_map 
basic_map::flatten_domain() const { - auto res = isl_ast_node_list_drop(copy(), first, n); + auto res = isl_basic_map_flatten_domain(copy()); return manage(res); } -stat ast_node_list::foreach(const std::function &fn) const +isl::basic_map basic_map::flatten_range() const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_ast_node *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_ast_node_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_basic_map_flatten_range(copy()); return manage(res); } -isl::ast_node_list ast_node_list::from_ast_node(isl::ast_node el) +isl::map basic_map::floordiv_val(const isl::val &d) const { - auto res = isl_ast_node_list_from_ast_node(el.release()); - return manage(res); + return isl::map(*this).floordiv_val(d); } -isl::ast_node ast_node_list::get_ast_node(int index) const +isl::map basic_map::floordiv_val(long d) const { - auto res = isl_ast_node_list_get_ast_node(get(), index); - return manage(res); + return this->floordiv_val(isl::val(ctx(), d)); } -isl::ast_node ast_node_list::get_at(int index) const +stat basic_map::foreach_basic_map(const std::function &fn) const { - auto res = isl_ast_node_list_get_at(get(), index); - return manage(res); + return isl::map(*this).foreach_basic_map(fn); } -isl::ast_node_list ast_node_list::insert(unsigned int pos, isl::ast_node el) const +stat basic_map::foreach_map(const std::function &fn) const { - auto res = isl_ast_node_list_insert(copy(), pos, el.release()); - return manage(res); + return isl::map(*this).foreach_map(fn); } -isl_size ast_node_list::n_ast_node() const +isl::basic_map basic_map::from_aff(isl::aff aff) { - auto res = isl_ast_node_list_n_ast_node(get()); - return res; + auto res = isl_basic_map_from_aff(aff.release()); + return manage(res); } -isl::ast_node_list ast_node_list::reverse() const +isl::basic_map 
basic_map::from_domain_and_range(isl::basic_set domain, isl::basic_set range) { - auto res = isl_ast_node_list_reverse(copy()); + auto res = isl_basic_map_from_domain_and_range(domain.release(), range.release()); return manage(res); } -isl::ast_node_list ast_node_list::set_ast_node(int index, isl::ast_node el) const +isl::basic_map basic_map::gist(isl::basic_map context) const { - auto res = isl_ast_node_list_set_ast_node(copy(), index, el.release()); + auto res = isl_basic_map_gist(copy(), context.release()); return manage(res); } -isl_size ast_node_list::size() const +isl::map basic_map::gist(const isl::map &context) const { - auto res = isl_ast_node_list_size(get()); - return res; + return isl::map(*this).gist(context); } -isl::ast_node_list ast_node_list::swap(unsigned int pos1, unsigned int pos2) const +isl::union_map basic_map::gist(const isl::union_map &context) const { - auto res = isl_ast_node_list_swap(copy(), pos1, pos2); - return manage(res); + return isl::map(*this).gist(context); } -// implementations for isl::basic_map -basic_map manage(__isl_take isl_basic_map *ptr) { - return basic_map(ptr); -} -basic_map manage_copy(__isl_keep isl_basic_map *ptr) { - ptr = isl_basic_map_copy(ptr); - return basic_map(ptr); +isl::map basic_map::gist_domain(const isl::set &context) const +{ + return isl::map(*this).gist_domain(context); } -basic_map::basic_map() - : ptr(nullptr) {} - -basic_map::basic_map(const basic_map &obj) - : ptr(nullptr) +isl::union_map basic_map::gist_domain(const isl::union_set &uset) const { - ptr = obj.copy(); + return isl::map(*this).gist_domain(uset); } - -basic_map::basic_map(__isl_take isl_basic_map *ptr) - : ptr(ptr) {} - -basic_map::basic_map(isl::ctx ctx, const std::string &str) +isl::map basic_map::gist_params(const isl::set &context) const { - auto res = isl_basic_map_read_from_str(ctx.release(), str.c_str()); - ptr = res; + return isl::map(*this).gist_params(context); } -basic_map &basic_map::operator=(basic_map obj) { - 
std::swap(this->ptr, obj.ptr); - return *this; +isl::union_map basic_map::gist_range(const isl::union_set &uset) const +{ + return isl::map(*this).gist_range(uset); } -basic_map::~basic_map() { - if (ptr) - isl_basic_map_free(ptr); +boolean basic_map::has_domain_tuple_id() const +{ + return isl::map(*this).has_domain_tuple_id(); } -__isl_give isl_basic_map *basic_map::copy() const & { - return isl_basic_map_copy(ptr); +boolean basic_map::has_equal_space(const isl::map &map2) const +{ + return isl::map(*this).has_equal_space(map2); } -__isl_keep isl_basic_map *basic_map::get() const { - return ptr; +boolean basic_map::has_range_tuple_id() const +{ + return isl::map(*this).has_range_tuple_id(); } -__isl_give isl_basic_map *basic_map::release() { - isl_basic_map *tmp = ptr; - ptr = nullptr; - return tmp; +boolean basic_map::has_tuple_id(isl::dim type) const +{ + return isl::map(*this).has_tuple_id(type); } -bool basic_map::is_null() const { - return ptr == nullptr; +boolean basic_map::has_tuple_name(isl::dim type) const +{ + return isl::map(*this).has_tuple_name(type); } - -isl::ctx basic_map::ctx() const { - return isl::ctx(isl_basic_map_get_ctx(ptr)); +isl::basic_map basic_map::intersect(isl::basic_map bmap2) const +{ + auto res = isl_basic_map_intersect(copy(), bmap2.release()); + return manage(res); } -void basic_map::dump() const { - isl_basic_map_dump(get()); +isl::map basic_map::intersect(const isl::map &map2) const +{ + return isl::map(*this).intersect(map2); } +isl::union_map basic_map::intersect(const isl::union_map &umap2) const +{ + return isl::map(*this).intersect(umap2); +} -isl::basic_map basic_map::add_constraint(isl::constraint constraint) const +isl::basic_map basic_map::intersect_domain(isl::basic_set bset) const { - auto res = isl_basic_map_add_constraint(copy(), constraint.release()); + auto res = isl_basic_map_intersect_domain(copy(), bset.release()); return manage(res); } -isl::basic_map basic_map::add_dims(isl::dim type, unsigned int n) const 
+isl::map basic_map::intersect_domain(const isl::set &set) const { - auto res = isl_basic_map_add_dims(copy(), static_cast(type), n); - return manage(res); + return isl::map(*this).intersect_domain(set); } -isl::basic_map basic_map::affine_hull() const +isl::union_map basic_map::intersect_domain(const isl::space &space) const { - auto res = isl_basic_map_affine_hull(copy()); - return manage(res); + return isl::map(*this).intersect_domain(space); } -isl::basic_map basic_map::align_params(isl::space model) const +isl::union_map basic_map::intersect_domain(const isl::union_set &uset) const { - auto res = isl_basic_map_align_params(copy(), model.release()); - return manage(res); + return isl::map(*this).intersect_domain(uset); } -isl::basic_map basic_map::apply_domain(isl::basic_map bmap2) const +isl::basic_map basic_map::intersect_domain(const isl::point &bset) const { - auto res = isl_basic_map_apply_domain(copy(), bmap2.release()); - return manage(res); + return this->intersect_domain(isl::basic_set(bset)); } -isl::basic_map basic_map::apply_range(isl::basic_map bmap2) const +isl::map basic_map::intersect_domain_factor_domain(const isl::map &factor) const { - auto res = isl_basic_map_apply_range(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).intersect_domain_factor_domain(factor); } -boolean basic_map::can_curry() const +isl::union_map basic_map::intersect_domain_factor_domain(const isl::union_map &factor) const { - auto res = isl_basic_map_can_curry(get()); - return manage(res); + return isl::map(*this).intersect_domain_factor_domain(factor); } -boolean basic_map::can_uncurry() const +isl::map basic_map::intersect_domain_factor_range(const isl::map &factor) const { - auto res = isl_basic_map_can_uncurry(get()); - return manage(res); + return isl::map(*this).intersect_domain_factor_range(factor); } -boolean basic_map::can_zip() const +isl::union_map basic_map::intersect_domain_factor_range(const isl::union_map &factor) const { - auto res = 
isl_basic_map_can_zip(get()); - return manage(res); + return isl::map(*this).intersect_domain_factor_range(factor); } -isl::basic_map basic_map::curry() const +isl::map basic_map::intersect_params(const isl::set ¶ms) const { - auto res = isl_basic_map_curry(copy()); - return manage(res); + return isl::map(*this).intersect_params(params); } -isl::basic_set basic_map::deltas() const +isl::basic_map basic_map::intersect_range(isl::basic_set bset) const { - auto res = isl_basic_map_deltas(copy()); + auto res = isl_basic_map_intersect_range(copy(), bset.release()); return manage(res); } -isl::basic_map basic_map::deltas_map() const +isl::map basic_map::intersect_range(const isl::set &set) const { - auto res = isl_basic_map_deltas_map(copy()); - return manage(res); + return isl::map(*this).intersect_range(set); } -isl::basic_map basic_map::detect_equalities() const +isl::union_map basic_map::intersect_range(const isl::space &space) const { - auto res = isl_basic_map_detect_equalities(copy()); - return manage(res); + return isl::map(*this).intersect_range(space); } -isl_size basic_map::dim(isl::dim type) const +isl::union_map basic_map::intersect_range(const isl::union_set &uset) const { - auto res = isl_basic_map_dim(get(), static_cast(type)); - return res; + return isl::map(*this).intersect_range(uset); } -isl::basic_set basic_map::domain() const +isl::basic_map basic_map::intersect_range(const isl::point &bset) const { - auto res = isl_basic_map_domain(copy()); - return manage(res); + return this->intersect_range(isl::basic_set(bset)); } -isl::basic_map basic_map::domain_map() const +isl::map basic_map::intersect_range_factor_domain(const isl::map &factor) const { - auto res = isl_basic_map_domain_map(copy()); - return manage(res); + return isl::map(*this).intersect_range_factor_domain(factor); } -isl::basic_map basic_map::domain_product(isl::basic_map bmap2) const +isl::union_map basic_map::intersect_range_factor_domain(const isl::union_map &factor) const { - auto res 
= isl_basic_map_domain_product(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).intersect_range_factor_domain(factor); } -isl::basic_map basic_map::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::map basic_map::intersect_range_factor_range(const isl::map &factor) const { - auto res = isl_basic_map_drop_constraints_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).intersect_range_factor_range(factor); } -isl::basic_map basic_map::drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_map basic_map::intersect_range_factor_range(const isl::union_map &factor) const { - auto res = isl_basic_map_drop_constraints_not_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).intersect_range_factor_range(factor); } -isl::basic_map basic_map::drop_unused_params() const +boolean basic_map::involves_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_map_drop_unused_params(copy()); - return manage(res); + return isl::map(*this).involves_dims(type, first, n); } -isl::basic_map basic_map::eliminate(isl::dim type, unsigned int first, unsigned int n) const +boolean basic_map::is_bijective() const { - auto res = isl_basic_map_eliminate(copy(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).is_bijective(); } -isl::basic_map basic_map::empty(isl::space space) +boolean basic_map::is_disjoint(const isl::map &map2) const { - auto res = isl_basic_map_empty(space.release()); - return manage(res); + return isl::map(*this).is_disjoint(map2); } -isl::basic_map basic_map::equal(isl::space space, unsigned int n_equal) +boolean basic_map::is_disjoint(const isl::union_map &umap2) const { - auto res = isl_basic_map_equal(space.release(), n_equal); - return manage(res); + return isl::map(*this).is_disjoint(umap2); } 
-isl::mat basic_map::equalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4, isl::dim c5) const +boolean basic_map::is_empty() const { - auto res = isl_basic_map_equalities_matrix(get(), static_cast(c1), static_cast(c2), static_cast(c3), static_cast(c4), static_cast(c5)); + auto res = isl_basic_map_is_empty(get()); return manage(res); } -isl::basic_map basic_map::equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const +boolean basic_map::is_equal(const isl::basic_map &bmap2) const { - auto res = isl_basic_map_equate(copy(), static_cast(type1), pos1, static_cast(type2), pos2); + auto res = isl_basic_map_is_equal(get(), bmap2.get()); return manage(res); } -int basic_map::find_dim_by_name(isl::dim type, const std::string &name) const +boolean basic_map::is_equal(const isl::map &map2) const { - auto res = isl_basic_map_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return isl::map(*this).is_equal(map2); } -isl::basic_map basic_map::fix_si(isl::dim type, unsigned int pos, int value) const +boolean basic_map::is_equal(const isl::union_map &umap2) const { - auto res = isl_basic_map_fix_si(copy(), static_cast(type), pos, value); - return manage(res); + return isl::map(*this).is_equal(umap2); } -isl::basic_map basic_map::fix_val(isl::dim type, unsigned int pos, isl::val v) const +boolean basic_map::is_injective() const { - auto res = isl_basic_map_fix_val(copy(), static_cast(type), pos, v.release()); - return manage(res); + return isl::map(*this).is_injective(); } -isl::basic_map basic_map::flat_product(isl::basic_map bmap2) const +boolean basic_map::is_single_valued() const { - auto res = isl_basic_map_flat_product(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).is_single_valued(); } -isl::basic_map basic_map::flat_range_product(isl::basic_map bmap2) const +boolean basic_map::is_strict_subset(const isl::map &map2) const { - auto res = isl_basic_map_flat_range_product(copy(), bmap2.release()); - 
return manage(res); + return isl::map(*this).is_strict_subset(map2); } -isl::basic_map basic_map::flatten() const +boolean basic_map::is_strict_subset(const isl::union_map &umap2) const { - auto res = isl_basic_map_flatten(copy()); - return manage(res); + return isl::map(*this).is_strict_subset(umap2); } -isl::basic_map basic_map::flatten_domain() const +boolean basic_map::is_subset(const isl::basic_map &bmap2) const { - auto res = isl_basic_map_flatten_domain(copy()); + auto res = isl_basic_map_is_subset(get(), bmap2.get()); return manage(res); } -isl::basic_map basic_map::flatten_range() const +boolean basic_map::is_subset(const isl::map &map2) const { - auto res = isl_basic_map_flatten_range(copy()); - return manage(res); + return isl::map(*this).is_subset(map2); } -stat basic_map::foreach_constraint(const std::function &fn) const +boolean basic_map::is_subset(const isl::union_map &umap2) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_constraint *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_basic_map_foreach_constraint(get(), fn_lambda, &fn_data); - return manage(res); + return isl::map(*this).is_subset(umap2); } -isl::basic_map basic_map::from_aff(isl::aff aff) +boolean basic_map::isa_map() const { - auto res = isl_basic_map_from_aff(aff.release()); - return manage(res); + return isl::map(*this).isa_map(); } -isl::basic_map basic_map::from_aff_list(isl::space domain_space, isl::aff_list list) +isl::map basic_map::lex_ge_at(const isl::multi_pw_aff &mpa) const { - auto res = isl_basic_map_from_aff_list(domain_space.release(), list.release()); - return manage(res); + return isl::map(*this).lex_ge_at(mpa); } -isl::basic_map basic_map::from_constraint(isl::constraint constraint) +isl::map basic_map::lex_gt_at(const isl::multi_pw_aff &mpa) const { - auto res = 
isl_basic_map_from_constraint(constraint.release()); - return manage(res); + return isl::map(*this).lex_gt_at(mpa); } -isl::basic_map basic_map::from_domain(isl::basic_set bset) +isl::map basic_map::lex_le_at(const isl::multi_pw_aff &mpa) const { - auto res = isl_basic_map_from_domain(bset.release()); - return manage(res); + return isl::map(*this).lex_le_at(mpa); } -isl::basic_map basic_map::from_domain_and_range(isl::basic_set domain, isl::basic_set range) +isl::map basic_map::lex_lt_at(const isl::multi_pw_aff &mpa) const { - auto res = isl_basic_map_from_domain_and_range(domain.release(), range.release()); - return manage(res); + return isl::map(*this).lex_lt_at(mpa); } -isl::basic_map basic_map::from_multi_aff(isl::multi_aff maff) +isl::map basic_map::lexmax() const { - auto res = isl_basic_map_from_multi_aff(maff.release()); + auto res = isl_basic_map_lexmax(copy()); return manage(res); } -isl::basic_map basic_map::from_qpolynomial(isl::qpolynomial qp) +isl::pw_multi_aff basic_map::lexmax_pw_multi_aff() const { - auto res = isl_basic_map_from_qpolynomial(qp.release()); - return manage(res); + return isl::map(*this).lexmax_pw_multi_aff(); } -isl::basic_map basic_map::from_range(isl::basic_set bset) +isl::map basic_map::lexmin() const { - auto res = isl_basic_map_from_range(bset.release()); + auto res = isl_basic_map_lexmin(copy()); return manage(res); } -isl::constraint_list basic_map::get_constraint_list() const +isl::pw_multi_aff basic_map::lexmin_pw_multi_aff() const { - auto res = isl_basic_map_get_constraint_list(get()); - return manage(res); + return isl::map(*this).lexmin_pw_multi_aff(); } -std::string basic_map::get_dim_name(isl::dim type, unsigned int pos) const +isl::map basic_map::lower_bound(const isl::multi_pw_aff &lower) const { - auto res = isl_basic_map_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + return isl::map(*this).lower_bound(lower); } -isl::aff basic_map::get_div(int pos) const +isl::map 
basic_map::lower_bound_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_basic_map_get_div(get(), pos); - return manage(res); + return isl::map(*this).lower_bound_si(type, pos, value); } -isl::local_space basic_map::get_local_space() const +isl::map_list basic_map::map_list() const { - auto res = isl_basic_map_get_local_space(get()); - return manage(res); + return isl::map(*this).map_list(); } -isl::space basic_map::get_space() const +isl::multi_pw_aff basic_map::max_multi_pw_aff() const { - auto res = isl_basic_map_get_space(get()); - return manage(res); + return isl::map(*this).max_multi_pw_aff(); } -std::string basic_map::get_tuple_name(isl::dim type) const +isl::multi_pw_aff basic_map::min_multi_pw_aff() const { - auto res = isl_basic_map_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + return isl::map(*this).min_multi_pw_aff(); } -isl::basic_map basic_map::gist(isl::basic_map context) const +isl::map basic_map::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const { - auto res = isl_basic_map_gist(copy(), context.release()); - return manage(res); + return isl::map(*this).move_dims(dst_type, dst_pos, src_type, src_pos, n); } -isl::basic_map basic_map::gist_domain(isl::basic_set context) const +class size basic_map::n_basic_map() const { - auto res = isl_basic_map_gist_domain(copy(), context.release()); - return manage(res); + return isl::map(*this).n_basic_map(); } -boolean basic_map::has_dim_id(isl::dim type, unsigned int pos) const +isl::map basic_map::order_lt(isl::dim type1, int pos1, isl::dim type2, int pos2) const { - auto res = isl_basic_map_has_dim_id(get(), static_cast(type), pos); - return manage(res); + return isl::map(*this).order_lt(type1, pos1, type2, pos2); } -isl::basic_map basic_map::identity(isl::space space) +isl::set basic_map::params() const { - auto res = isl_basic_map_identity(space.release()); - return manage(res); + return 
isl::map(*this).params(); } -boolean basic_map::image_is_bounded() const +isl::val basic_map::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const { - auto res = isl_basic_map_image_is_bounded(get()); + auto res = isl_basic_map_plain_get_val_if_fixed(get(), static_cast(type), pos); return manage(res); } -isl::mat basic_map::inequalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4, isl::dim c5) const +isl::basic_map basic_map::polyhedral_hull() const { - auto res = isl_basic_map_inequalities_matrix(get(), static_cast(c1), static_cast(c2), static_cast(c3), static_cast(c4), static_cast(c5)); - return manage(res); + return isl::map(*this).polyhedral_hull(); } -isl::basic_map basic_map::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const +isl::map basic_map::preimage_domain(const isl::multi_aff &ma) const { - auto res = isl_basic_map_insert_dims(copy(), static_cast(type), pos, n); - return manage(res); + return isl::map(*this).preimage_domain(ma); } -isl::basic_map basic_map::intersect(isl::basic_map bmap2) const +isl::map basic_map::preimage_domain(const isl::multi_pw_aff &mpa) const { - auto res = isl_basic_map_intersect(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).preimage_domain(mpa); } -isl::basic_map basic_map::intersect_domain(isl::basic_set bset) const +isl::map basic_map::preimage_domain(const isl::pw_multi_aff &pma) const { - auto res = isl_basic_map_intersect_domain(copy(), bset.release()); - return manage(res); + return isl::map(*this).preimage_domain(pma); } -isl::basic_map basic_map::intersect_range(isl::basic_set bset) const +isl::union_map basic_map::preimage_domain(const isl::union_pw_multi_aff &upma) const { - auto res = isl_basic_map_intersect_range(copy(), bset.release()); - return manage(res); + return isl::map(*this).preimage_domain(upma); } -boolean basic_map::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::map basic_map::preimage_range(const 
isl::multi_aff &ma) const { - auto res = isl_basic_map_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).preimage_range(ma); } -boolean basic_map::is_disjoint(const isl::basic_map &bmap2) const +isl::map basic_map::preimage_range(const isl::pw_multi_aff &pma) const { - auto res = isl_basic_map_is_disjoint(get(), bmap2.get()); - return manage(res); + return isl::map(*this).preimage_range(pma); } -boolean basic_map::is_empty() const +isl::union_map basic_map::preimage_range(const isl::union_pw_multi_aff &upma) const { - auto res = isl_basic_map_is_empty(get()); - return manage(res); + return isl::map(*this).preimage_range(upma); } -boolean basic_map::is_equal(const isl::basic_map &bmap2) const +isl::map basic_map::product(const isl::map &map2) const { - auto res = isl_basic_map_is_equal(get(), bmap2.get()); - return manage(res); + return isl::map(*this).product(map2); } -boolean basic_map::is_rational() const +isl::union_map basic_map::product(const isl::union_map &umap2) const { - auto res = isl_basic_map_is_rational(get()); - return manage(res); + return isl::map(*this).product(umap2); } -boolean basic_map::is_single_valued() const +isl::map basic_map::project_out(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_map_is_single_valued(get()); - return manage(res); + return isl::map(*this).project_out(type, first, n); } -boolean basic_map::is_strict_subset(const isl::basic_map &bmap2) const +isl::map basic_map::project_out_all_params() const { - auto res = isl_basic_map_is_strict_subset(get(), bmap2.get()); - return manage(res); + return isl::map(*this).project_out_all_params(); } -boolean basic_map::is_subset(const isl::basic_map &bmap2) const +isl::set basic_map::range() const { - auto res = isl_basic_map_is_subset(get(), bmap2.get()); - return manage(res); + return isl::map(*this).range(); } -boolean basic_map::is_universe() const +isl::map basic_map::range_factor_domain() const { - 
auto res = isl_basic_map_is_universe(get()); - return manage(res); + return isl::map(*this).range_factor_domain(); } -isl::basic_map basic_map::less_at(isl::space space, unsigned int pos) +isl::map basic_map::range_factor_range() const { - auto res = isl_basic_map_less_at(space.release(), pos); - return manage(res); + return isl::map(*this).range_factor_range(); } -isl::map basic_map::lexmax() const +isl::fixed_box basic_map::range_lattice_tile() const { - auto res = isl_basic_map_lexmax(copy()); - return manage(res); + return isl::map(*this).range_lattice_tile(); } -isl::map basic_map::lexmin() const +isl::map basic_map::range_map() const { - auto res = isl_basic_map_lexmin(copy()); - return manage(res); + return isl::map(*this).range_map(); } -isl::pw_multi_aff basic_map::lexmin_pw_multi_aff() const +isl::map basic_map::range_product(const isl::map &map2) const { - auto res = isl_basic_map_lexmin_pw_multi_aff(copy()); - return manage(res); + return isl::map(*this).range_product(map2); } -isl::basic_map basic_map::lower_bound_si(isl::dim type, unsigned int pos, int value) const +isl::union_map basic_map::range_product(const isl::union_map &umap2) const { - auto res = isl_basic_map_lower_bound_si(copy(), static_cast(type), pos, value); - return manage(res); + return isl::map(*this).range_product(umap2); } -isl::basic_map basic_map::more_at(isl::space space, unsigned int pos) +isl::map basic_map::range_reverse() const { - auto res = isl_basic_map_more_at(space.release(), pos); - return manage(res); + return isl::map(*this).range_reverse(); } -isl::basic_map basic_map::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::fixed_box basic_map::range_simple_fixed_box_hull() const { - auto res = isl_basic_map_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); + return isl::map(*this).range_simple_fixed_box_hull(); } -isl_size 
basic_map::n_constraint() const +class size basic_map::range_tuple_dim() const { - auto res = isl_basic_map_n_constraint(get()); - return res; + return isl::map(*this).range_tuple_dim(); } -isl::basic_map basic_map::nat_universe(isl::space space) +isl::id basic_map::range_tuple_id() const { - auto res = isl_basic_map_nat_universe(space.release()); - return manage(res); + return isl::map(*this).range_tuple_id(); } -isl::basic_map basic_map::neg() const +isl::basic_map basic_map::reverse() const { - auto res = isl_basic_map_neg(copy()); + auto res = isl_basic_map_reverse(copy()); return manage(res); } -isl::basic_map basic_map::order_ge(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::basic_map basic_map::sample() const { - auto res = isl_basic_map_order_ge(copy(), static_cast(type1), pos1, static_cast(type2), pos2); + auto res = isl_basic_map_sample(copy()); return manage(res); } -isl::basic_map basic_map::order_gt(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::map basic_map::set_domain_tuple(const isl::id &id) const { - auto res = isl_basic_map_order_gt(copy(), static_cast(type1), pos1, static_cast(type2), pos2); - return manage(res); + return isl::map(*this).set_domain_tuple(id); } -isl::val basic_map::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const +isl::map basic_map::set_domain_tuple(const std::string &id) const { - auto res = isl_basic_map_plain_get_val_if_fixed(get(), static_cast(type), pos); - return manage(res); + return this->set_domain_tuple(isl::id(ctx(), id)); } -boolean basic_map::plain_is_empty() const +isl::map basic_map::set_range_tuple(const isl::id &id) const { - auto res = isl_basic_map_plain_is_empty(get()); - return manage(res); + return isl::map(*this).set_range_tuple(id); } -boolean basic_map::plain_is_universe() const +isl::map basic_map::set_range_tuple(const std::string &id) const { - auto res = isl_basic_map_plain_is_universe(get()); - return manage(res); + return 
this->set_range_tuple(isl::id(ctx(), id)); } -isl::basic_map basic_map::preimage_domain_multi_aff(isl::multi_aff ma) const +isl::map basic_map::set_tuple_id(isl::dim type, const isl::id &id) const { - auto res = isl_basic_map_preimage_domain_multi_aff(copy(), ma.release()); - return manage(res); + return isl::map(*this).set_tuple_id(type, id); } -isl::basic_map basic_map::preimage_range_multi_aff(isl::multi_aff ma) const +isl::map basic_map::set_tuple_id(isl::dim type, const std::string &id) const { - auto res = isl_basic_map_preimage_range_multi_aff(copy(), ma.release()); - return manage(res); + return this->set_tuple_id(type, isl::id(ctx(), id)); } -isl::basic_map basic_map::product(isl::basic_map bmap2) const +isl::space basic_map::space() const { - auto res = isl_basic_map_product(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).space(); } -isl::basic_map basic_map::project_out(isl::dim type, unsigned int first, unsigned int n) const +isl::map basic_map::subtract(const isl::map &map2) const { - auto res = isl_basic_map_project_out(copy(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).subtract(map2); } -isl::basic_set basic_map::range() const +isl::union_map basic_map::subtract(const isl::union_map &umap2) const { - auto res = isl_basic_map_range(copy()); - return manage(res); + return isl::map(*this).subtract(umap2); } -isl::basic_map basic_map::range_map() const +isl::union_map basic_map::subtract_domain(const isl::union_set &dom) const { - auto res = isl_basic_map_range_map(copy()); - return manage(res); + return isl::map(*this).subtract_domain(dom); } -isl::basic_map basic_map::range_product(isl::basic_map bmap2) const +isl::union_map basic_map::subtract_range(const isl::union_set &dom) const { - auto res = isl_basic_map_range_product(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).subtract_range(dom); } -isl::basic_map basic_map::remove_dims(isl::dim type, unsigned int first, 
unsigned int n) const +isl::map basic_map::sum(const isl::map &map2) const { - auto res = isl_basic_map_remove_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).sum(map2); } -isl::basic_map basic_map::remove_divs() const +isl::basic_map_list basic_map::to_list() const { - auto res = isl_basic_map_remove_divs(copy()); + auto res = isl_basic_map_to_list(copy()); return manage(res); } -isl::basic_map basic_map::remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_map basic_map::to_union_map() const { - auto res = isl_basic_map_remove_divs_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::map(*this).to_union_map(); } -isl::basic_map basic_map::remove_redundancies() const +isl::id basic_map::tuple_id(isl::dim type) const { - auto res = isl_basic_map_remove_redundancies(copy()); - return manage(res); + return isl::map(*this).tuple_id(type); } -isl::basic_map basic_map::reverse() const +isl::map basic_map::uncurry() const { - auto res = isl_basic_map_reverse(copy()); - return manage(res); + return isl::map(*this).uncurry(); } -isl::basic_map basic_map::sample() const +isl::map basic_map::unite(isl::basic_map bmap2) const { - auto res = isl_basic_map_sample(copy()); + auto res = isl_basic_map_union(copy(), bmap2.release()); return manage(res); } -isl::basic_map basic_map::set_tuple_id(isl::dim type, isl::id id) const +isl::map basic_map::unite(const isl::map &map2) const { - auto res = isl_basic_map_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); + return isl::map(*this).unite(map2); } -isl::basic_map basic_map::set_tuple_name(isl::dim type, const std::string &s) const +isl::union_map basic_map::unite(const isl::union_map &umap2) const { - auto res = isl_basic_map_set_tuple_name(copy(), static_cast(type), s.c_str()); - return manage(res); + return isl::map(*this).unite(umap2); } -isl::basic_map basic_map::sum(isl::basic_map 
bmap2) const +isl::basic_map basic_map::universe(isl::space space) { - auto res = isl_basic_map_sum(copy(), bmap2.release()); + auto res = isl_basic_map_universe(space.release()); return manage(res); } -isl::basic_map basic_map::uncurry() const +isl::basic_map basic_map::unshifted_simple_hull() const { - auto res = isl_basic_map_uncurry(copy()); - return manage(res); + return isl::map(*this).unshifted_simple_hull(); } -isl::map basic_map::unite(isl::basic_map bmap2) const +isl::map basic_map::upper_bound(const isl::multi_pw_aff &upper) const { - auto res = isl_basic_map_union(copy(), bmap2.release()); - return manage(res); + return isl::map(*this).upper_bound(upper); } -isl::basic_map basic_map::universe(isl::space space) +isl::map basic_map::upper_bound_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_basic_map_universe(space.release()); - return manage(res); + return isl::map(*this).upper_bound_si(type, pos, value); } -isl::basic_map basic_map::upper_bound_si(isl::dim type, unsigned int pos, int value) const +isl::set basic_map::wrap() const { - auto res = isl_basic_map_upper_bound_si(copy(), static_cast(type), pos, value); - return manage(res); + return isl::map(*this).wrap(); } -isl::basic_set basic_map::wrap() const +isl::map basic_map::zip() const { - auto res = isl_basic_map_wrap(copy()); - return manage(res); + return isl::map(*this).zip(); } -isl::basic_map basic_map::zip() const +inline std::ostream &operator<<(std::ostream &os, const basic_map &obj) { - auto res = isl_basic_map_zip(copy()); - return manage(res); + char *str = isl_basic_map_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::basic_map_list @@ -6258,10 +8895,20 @@ basic_map_list::basic_map_list(const basic_map_list &obj) ptr = obj.copy(); } - basic_map_list::basic_map_list(__isl_take isl_basic_map_list *ptr) : ptr(ptr) {} +basic_map_list::basic_map_list(isl::ctx 
ctx, int n) +{ + auto res = isl_basic_map_list_alloc(ctx.release(), n); + ptr = res; +} + +basic_map_list::basic_map_list(isl::basic_map el) +{ + auto res = isl_basic_map_list_from_basic_map(el.release()); + ptr = res; +} basic_map_list &basic_map_list::operator=(basic_map_list obj) { std::swap(this->ptr, obj.ptr); @@ -6291,28 +8938,27 @@ bool basic_map_list::is_null() const { return ptr == nullptr; } - isl::ctx basic_map_list::ctx() const { return isl::ctx(isl_basic_map_list_get_ctx(ptr)); } -void basic_map_list::dump() const { - isl_basic_map_list_dump(get()); -} - - isl::basic_map_list basic_map_list::add(isl::basic_map el) const { auto res = isl_basic_map_list_add(copy(), el.release()); return manage(res); } -isl::basic_map_list basic_map_list::alloc(isl::ctx ctx, int n) +isl::basic_map basic_map_list::at(int index) const { - auto res = isl_basic_map_list_alloc(ctx.release(), n); + auto res = isl_basic_map_list_get_at(get(), index); return manage(res); } +isl::basic_map basic_map_list::get_at(int index) const +{ + return at(index); +} + isl::basic_map_list basic_map_list::clear() const { auto res = isl_basic_map_list_clear(copy()); @@ -6331,72 +8977,42 @@ isl::basic_map_list basic_map_list::drop(unsigned int first, unsigned int n) con return manage(res); } -stat basic_map_list::foreach(const std::function &fn) const +stat basic_map_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_basic_map *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_basic_map_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::basic_map_list basic_map_list::from_basic_map(isl::basic_map el) -{ - auto res = isl_basic_map_list_from_basic_map(el.release()); - return manage(res); -} - -isl::basic_map 
basic_map_list::get_at(int index) const -{ - auto res = isl_basic_map_list_get_at(get(), index); - return manage(res); -} - -isl::basic_map basic_map_list::get_basic_map(int index) const -{ - auto res = isl_basic_map_list_get_basic_map(get(), index); - return manage(res); -} - isl::basic_map_list basic_map_list::insert(unsigned int pos, isl::basic_map el) const { auto res = isl_basic_map_list_insert(copy(), pos, el.release()); return manage(res); } -isl_size basic_map_list::n_basic_map() const -{ - auto res = isl_basic_map_list_n_basic_map(get()); - return res; -} - -isl::basic_map_list basic_map_list::reverse() const -{ - auto res = isl_basic_map_list_reverse(copy()); - return manage(res); -} - -isl::basic_map_list basic_map_list::set_basic_map(int index, isl::basic_map el) const -{ - auto res = isl_basic_map_list_set_basic_map(copy(), index, el.release()); - return manage(res); -} - -isl_size basic_map_list::size() const +class size basic_map_list::size() const { auto res = isl_basic_map_list_size(get()); - return res; + return manage(res); } -isl::basic_map_list basic_map_list::swap(unsigned int pos1, unsigned int pos2) const +inline std::ostream &operator<<(std::ostream &os, const basic_map_list &obj) { - auto res = isl_basic_map_list_swap(copy(), pos1, pos2); - return manage(res); + char *str = isl_basic_map_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::basic_set @@ -6417,7 +9033,6 @@ basic_set::basic_set(const basic_set &obj) ptr = obj.copy(); } - basic_set::basic_set(__isl_take isl_basic_set *ptr) : ptr(ptr) {} @@ -6426,6 +9041,7 @@ basic_set::basic_set(isl::point pnt) auto res = isl_basic_set_from_point(pnt.release()); ptr = res; } + basic_set::basic_set(isl::ctx ctx, const std::string &str) { auto res = isl_basic_set_read_from_str(ctx.release(), str.c_str()); @@ -6460,15 +9076,19 @@ bool basic_set::is_null() const { return ptr == nullptr; } - 
isl::ctx basic_set::ctx() const { return isl::ctx(isl_basic_set_get_ctx(ptr)); } -void basic_set::dump() const { - isl_basic_set_dump(get()); +isl::set basic_set::add_constraint(const isl::constraint &constraint) const +{ + return isl::set(*this).add_constraint(constraint); } +isl::set basic_set::add_dims(isl::dim type, unsigned int n) const +{ + return isl::set(*this).add_dims(type, n); +} isl::basic_set basic_set::affine_hull() const { @@ -6476,10 +9096,9 @@ isl::basic_set basic_set::affine_hull() const return manage(res); } -isl::basic_set basic_set::align_params(isl::space model) const +isl::set basic_set::align_params(const isl::space &model) const { - auto res = isl_basic_set_align_params(copy(), model.release()); - return manage(res); + return isl::set(*this).align_params(model); } isl::basic_set basic_set::apply(isl::basic_map bmap) const @@ -6488,960 +9107,968 @@ isl::basic_set basic_set::apply(isl::basic_map bmap) const return manage(res); } -isl::basic_set basic_set::box_from_points(isl::point pnt1, isl::point pnt2) +isl::set basic_set::apply(const isl::map &map) const { - auto res = isl_basic_set_box_from_points(pnt1.release(), pnt2.release()); - return manage(res); + return isl::set(*this).apply(map); } -isl::basic_set basic_set::coefficients() const +isl::union_set basic_set::apply(const isl::union_map &umap) const { - auto res = isl_basic_set_coefficients(copy()); - return manage(res); + return isl::set(*this).apply(umap); } -isl::basic_set basic_set::detect_equalities() const +isl::pw_multi_aff basic_set::as_pw_multi_aff() const { - auto res = isl_basic_set_detect_equalities(copy()); - return manage(res); + return isl::set(*this).as_pw_multi_aff(); } -isl_size basic_set::dim(isl::dim type) const +isl::set basic_set::as_set() const { - auto res = isl_basic_set_dim(get(), static_cast(type)); - return res; + return isl::set(*this).as_set(); } -isl::val basic_set::dim_max_val(int pos) const +isl::basic_set_list basic_set::basic_set_list() const { - auto 
res = isl_basic_set_dim_max_val(copy(), pos); - return manage(res); + return isl::set(*this).basic_set_list(); } -isl::basic_set basic_set::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::set basic_set::bind(const isl::multi_id &tuple) const { - auto res = isl_basic_set_drop_constraints_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).bind(tuple); } -isl::basic_set basic_set::drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::set basic_set::coalesce() const { - auto res = isl_basic_set_drop_constraints_not_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).coalesce(); } -isl::basic_set basic_set::drop_unused_params() const +isl::set basic_set::complement() const { - auto res = isl_basic_set_drop_unused_params(copy()); - return manage(res); + return isl::set(*this).complement(); } -isl::basic_set basic_set::eliminate(isl::dim type, unsigned int first, unsigned int n) const +isl::union_set basic_set::compute_divs() const { - auto res = isl_basic_set_eliminate(copy(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).compute_divs(); } -isl::basic_set basic_set::empty(isl::space space) +boolean basic_set::contains(const isl::space &space) const { - auto res = isl_basic_set_empty(space.release()); - return manage(res); + return isl::set(*this).contains(space); } -isl::mat basic_set::equalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4) const +isl::basic_set basic_set::convex_hull() const { - auto res = isl_basic_set_equalities_matrix(get(), static_cast(c1), static_cast(c2), static_cast(c3), static_cast(c4)); - return manage(res); + return isl::set(*this).convex_hull(); } -isl::basic_set basic_set::fix_si(isl::dim type, unsigned int pos, int value) const +isl::basic_set basic_set::detect_equalities() const { - auto res = 
isl_basic_set_fix_si(copy(), static_cast(type), pos, value); + auto res = isl_basic_set_detect_equalities(copy()); return manage(res); } -isl::basic_set basic_set::fix_val(isl::dim type, unsigned int pos, isl::val v) const +class size basic_set::dim(isl::dim type) const { - auto res = isl_basic_set_fix_val(copy(), static_cast(type), pos, v.release()); + auto res = isl_basic_set_dim(get(), static_cast(type)); return manage(res); } -isl::basic_set basic_set::flat_product(isl::basic_set bset2) const +boolean basic_set::dim_has_any_lower_bound(isl::dim type, unsigned int pos) const { - auto res = isl_basic_set_flat_product(copy(), bset2.release()); - return manage(res); + return isl::set(*this).dim_has_any_lower_bound(type, pos); } -isl::basic_set basic_set::flatten() const +isl::id basic_set::dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_basic_set_flatten(copy()); - return manage(res); + return isl::set(*this).dim_id(type, pos); } -stat basic_set::foreach_bound_pair(isl::dim type, unsigned int pos, const std::function &fn) const +isl::pw_aff basic_set::dim_max(int pos) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_constraint *arg_0, isl_constraint *arg_1, isl_basic_set *arg_2, void *arg_3) -> isl_stat { - auto *data = static_cast(arg_3); - stat ret = (*data->func)(manage(arg_0), manage(arg_1), manage(arg_2)); - return ret.release(); - }; - auto res = isl_basic_set_foreach_bound_pair(get(), static_cast(type), pos, fn_lambda, &fn_data); - return manage(res); + return isl::set(*this).dim_max(pos); } -stat basic_set::foreach_constraint(const std::function &fn) const +isl::val basic_set::dim_max_val(int pos) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_constraint *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = 
isl_basic_set_foreach_constraint(get(), fn_lambda, &fn_data); + auto res = isl_basic_set_dim_max_val(copy(), pos); return manage(res); } -isl::basic_set basic_set::from_constraint(isl::constraint constraint) +isl::pw_aff basic_set::dim_min(int pos) const { - auto res = isl_basic_set_from_constraint(constraint.release()); - return manage(res); + return isl::set(*this).dim_min(pos); } -isl::basic_set basic_set::from_multi_aff(isl::multi_aff ma) +isl::val basic_set::dim_min_val(int pos) const { - auto res = isl_basic_set_from_multi_aff(ma.release()); - return manage(res); + return isl::set(*this).dim_min_val(pos); } -isl::basic_set basic_set::from_params() const +std::string basic_set::dim_name(isl::dim type, unsigned int pos) const { - auto res = isl_basic_set_from_params(copy()); - return manage(res); + return isl::set(*this).dim_name(type, pos); } -isl::constraint_list basic_set::get_constraint_list() const +isl::aff basic_set::div(int pos) const { - auto res = isl_basic_set_get_constraint_list(get()); + auto res = isl_basic_set_get_div(get(), pos); return manage(res); } -isl::id basic_set::get_dim_id(isl::dim type, unsigned int pos) const +isl::aff basic_set::get_div(int pos) const { - auto res = isl_basic_set_get_dim_id(get(), static_cast(type), pos); - return manage(res); + return div(pos); } -std::string basic_set::get_dim_name(isl::dim type, unsigned int pos) const +isl::set basic_set::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_set_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + return isl::set(*this).drop_constraints_involving_dims(type, first, n); } -isl::aff basic_set::get_div(int pos) const +isl::set basic_set::eliminate(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_set_get_div(get(), pos); - return manage(res); + return isl::set(*this).eliminate(type, first, n); } -isl::local_space basic_set::get_local_space() 
const +boolean basic_set::every_set(const std::function &test) const { - auto res = isl_basic_set_get_local_space(get()); - return manage(res); + return isl::set(*this).every_set(test); } -isl::space basic_set::get_space() const +isl::set basic_set::extract_set(const isl::space &space) const { - auto res = isl_basic_set_get_space(get()); - return manage(res); + return isl::set(*this).extract_set(space); } -std::string basic_set::get_tuple_name() const +int basic_set::find_dim_by_id(isl::dim type, const isl::id &id) const { - auto res = isl_basic_set_get_tuple_name(get()); - std::string tmp(res); - return tmp; + return isl::set(*this).find_dim_by_id(type, id); } -isl::basic_set basic_set::gist(isl::basic_set context) const +int basic_set::find_dim_by_id(isl::dim type, const std::string &id) const { - auto res = isl_basic_set_gist(copy(), context.release()); - return manage(res); + return this->find_dim_by_id(type, isl::id(ctx(), id)); } -isl::mat basic_set::inequalities_matrix(isl::dim c1, isl::dim c2, isl::dim c3, isl::dim c4) const +isl::basic_set basic_set::fix_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_basic_set_inequalities_matrix(get(), static_cast(c1), static_cast(c2), static_cast(c3), static_cast(c4)); + auto res = isl_basic_set_fix_si(copy(), static_cast(type), pos, value); return manage(res); } -isl::basic_set basic_set::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const +isl::basic_set basic_set::fix_val(isl::dim type, unsigned int pos, isl::val v) const { - auto res = isl_basic_set_insert_dims(copy(), static_cast(type), pos, n); + auto res = isl_basic_set_fix_val(copy(), static_cast(type), pos, v.release()); return manage(res); } -isl::basic_set basic_set::intersect(isl::basic_set bset2) const +isl::basic_set basic_set::fix_val(isl::dim type, unsigned int pos, long v) const { - auto res = isl_basic_set_intersect(copy(), bset2.release()); - return manage(res); + return this->fix_val(type, pos, isl::val(ctx(), 
v)); } -isl::basic_set basic_set::intersect_params(isl::basic_set bset2) const +isl::basic_set basic_set::flatten() const { - auto res = isl_basic_set_intersect_params(copy(), bset2.release()); + auto res = isl_basic_set_flatten(copy()); return manage(res); } -boolean basic_set::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +stat basic_set::foreach_basic_set(const std::function &fn) const { - auto res = isl_basic_set_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).foreach_basic_set(fn); } -boolean basic_set::is_bounded() const +stat basic_set::foreach_point(const std::function &fn) const { - auto res = isl_basic_set_is_bounded(get()); - return manage(res); + return isl::set(*this).foreach_point(fn); } -boolean basic_set::is_disjoint(const isl::basic_set &bset2) const +stat basic_set::foreach_set(const std::function &fn) const { - auto res = isl_basic_set_is_disjoint(get(), bset2.get()); - return manage(res); + return isl::set(*this).foreach_set(fn); } -boolean basic_set::is_empty() const +isl::basic_set basic_set::gist(isl::basic_set context) const { - auto res = isl_basic_set_is_empty(get()); + auto res = isl_basic_set_gist(copy(), context.release()); return manage(res); } -boolean basic_set::is_equal(const isl::basic_set &bset2) const +isl::set basic_set::gist(const isl::set &context) const { - auto res = isl_basic_set_is_equal(get(), bset2.get()); - return manage(res); + return isl::set(*this).gist(context); } -int basic_set::is_rational() const +isl::union_set basic_set::gist(const isl::union_set &context) const { - auto res = isl_basic_set_is_rational(get()); - return res; + return isl::set(*this).gist(context); } -boolean basic_set::is_subset(const isl::basic_set &bset2) const +isl::basic_set basic_set::gist(const isl::point &context) const { - auto res = isl_basic_set_is_subset(get(), bset2.get()); - return manage(res); + return this->gist(isl::basic_set(context)); } -boolean 
basic_set::is_universe() const +isl::set basic_set::gist_params(const isl::set &context) const { - auto res = isl_basic_set_is_universe(get()); - return manage(res); + return isl::set(*this).gist_params(context); } -boolean basic_set::is_wrapping() const +boolean basic_set::has_equal_space(const isl::set &set2) const { - auto res = isl_basic_set_is_wrapping(get()); - return manage(res); + return isl::set(*this).has_equal_space(set2); } -isl::set basic_set::lexmax() const +isl::map basic_set::identity() const { - auto res = isl_basic_set_lexmax(copy()); - return manage(res); + return isl::set(*this).identity(); } -isl::set basic_set::lexmin() const +isl::union_pw_multi_aff basic_set::identity_union_pw_multi_aff() const { - auto res = isl_basic_set_lexmin(copy()); - return manage(res); + return isl::set(*this).identity_union_pw_multi_aff(); } -isl::basic_set basic_set::lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const +isl::pw_aff basic_set::indicator_function() const { - auto res = isl_basic_set_lower_bound_val(copy(), static_cast(type), pos, value.release()); - return manage(res); + return isl::set(*this).indicator_function(); } -isl::val basic_set::max_val(const isl::aff &obj) const +isl::set basic_set::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const { - auto res = isl_basic_set_max_val(get(), obj.get()); - return manage(res); + return isl::set(*this).insert_dims(type, pos, n); } -isl::basic_set basic_set::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::map basic_set::insert_domain(const isl::space &domain) const { - auto res = isl_basic_set_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); + return isl::set(*this).insert_domain(domain); } -isl_size basic_set::n_constraint() const +isl::basic_set basic_set::intersect(isl::basic_set bset2) const { - auto res = isl_basic_set_n_constraint(get()); - 
return res; + auto res = isl_basic_set_intersect(copy(), bset2.release()); + return manage(res); } -isl_size basic_set::n_dim() const +isl::set basic_set::intersect(const isl::set &set2) const { - auto res = isl_basic_set_n_dim(get()); - return res; + return isl::set(*this).intersect(set2); } -isl::basic_set basic_set::nat_universe(isl::space space) +isl::union_set basic_set::intersect(const isl::union_set &uset2) const { - auto res = isl_basic_set_nat_universe(space.release()); - return manage(res); + return isl::set(*this).intersect(uset2); } -isl::basic_set basic_set::neg() const +isl::basic_set basic_set::intersect(const isl::point &bset2) const { - auto res = isl_basic_set_neg(copy()); - return manage(res); + return this->intersect(isl::basic_set(bset2)); } -isl::basic_set basic_set::params() const +isl::basic_set basic_set::intersect_params(isl::basic_set bset2) const { - auto res = isl_basic_set_params(copy()); + auto res = isl_basic_set_intersect_params(copy(), bset2.release()); return manage(res); } -boolean basic_set::plain_is_empty() const +isl::set basic_set::intersect_params(const isl::set ¶ms) const { - auto res = isl_basic_set_plain_is_empty(get()); - return manage(res); + return isl::set(*this).intersect_params(params); } -boolean basic_set::plain_is_equal(const isl::basic_set &bset2) const +isl::basic_set basic_set::intersect_params(const isl::point &bset2) const { - auto res = isl_basic_set_plain_is_equal(get(), bset2.get()); - return manage(res); + return this->intersect_params(isl::basic_set(bset2)); } -boolean basic_set::plain_is_universe() const +boolean basic_set::involves_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_set_plain_is_universe(get()); - return manage(res); + return isl::set(*this).involves_dims(type, first, n); } -isl::basic_set basic_set::positive_orthant(isl::space space) +boolean basic_set::involves_locals() const { - auto res = isl_basic_set_positive_orthant(space.release()); - return 
manage(res); + return isl::set(*this).involves_locals(); } -isl::basic_set basic_set::preimage_multi_aff(isl::multi_aff ma) const +boolean basic_set::is_bounded() const { - auto res = isl_basic_set_preimage_multi_aff(copy(), ma.release()); + auto res = isl_basic_set_is_bounded(get()); return manage(res); } -isl::basic_set basic_set::project_out(isl::dim type, unsigned int first, unsigned int n) const +boolean basic_set::is_disjoint(const isl::set &set2) const { - auto res = isl_basic_set_project_out(copy(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).is_disjoint(set2); } -isl::mat basic_set::reduced_basis() const +boolean basic_set::is_disjoint(const isl::union_set &uset2) const { - auto res = isl_basic_set_reduced_basis(get()); - return manage(res); + return isl::set(*this).is_disjoint(uset2); } -isl::basic_set basic_set::remove_dims(isl::dim type, unsigned int first, unsigned int n) const +boolean basic_set::is_empty() const { - auto res = isl_basic_set_remove_dims(copy(), static_cast(type), first, n); + auto res = isl_basic_set_is_empty(get()); return manage(res); } -isl::basic_set basic_set::remove_divs() const +boolean basic_set::is_equal(const isl::basic_set &bset2) const { - auto res = isl_basic_set_remove_divs(copy()); + auto res = isl_basic_set_is_equal(get(), bset2.get()); return manage(res); } -isl::basic_set basic_set::remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +boolean basic_set::is_equal(const isl::set &set2) const { - auto res = isl_basic_set_remove_divs_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).is_equal(set2); } -isl::basic_set basic_set::remove_redundancies() const +boolean basic_set::is_equal(const isl::union_set &uset2) const { - auto res = isl_basic_set_remove_redundancies(copy()); - return manage(res); + return isl::set(*this).is_equal(uset2); } -isl::basic_set basic_set::remove_unknown_divs() const +boolean 
basic_set::is_equal(const isl::point &bset2) const { - auto res = isl_basic_set_remove_unknown_divs(copy()); - return manage(res); + return this->is_equal(isl::basic_set(bset2)); } -isl::basic_set basic_set::sample() const +boolean basic_set::is_params() const { - auto res = isl_basic_set_sample(copy()); - return manage(res); + return isl::set(*this).is_params(); } -isl::point basic_set::sample_point() const +boolean basic_set::is_singleton() const { - auto res = isl_basic_set_sample_point(copy()); - return manage(res); + return isl::set(*this).is_singleton(); } -isl::basic_set basic_set::set_tuple_id(isl::id id) const +boolean basic_set::is_strict_subset(const isl::set &set2) const { - auto res = isl_basic_set_set_tuple_id(copy(), id.release()); - return manage(res); + return isl::set(*this).is_strict_subset(set2); } -isl::basic_set basic_set::set_tuple_name(const std::string &s) const +boolean basic_set::is_strict_subset(const isl::union_set &uset2) const { - auto res = isl_basic_set_set_tuple_name(copy(), s.c_str()); - return manage(res); + return isl::set(*this).is_strict_subset(uset2); } -isl::basic_set basic_set::solutions() const +boolean basic_set::is_subset(const isl::basic_set &bset2) const { - auto res = isl_basic_set_solutions(copy()); + auto res = isl_basic_set_is_subset(get(), bset2.get()); return manage(res); } -isl::set basic_set::unite(isl::basic_set bset2) const +boolean basic_set::is_subset(const isl::set &set2) const { - auto res = isl_basic_set_union(copy(), bset2.release()); - return manage(res); + return isl::set(*this).is_subset(set2); } -isl::basic_set basic_set::universe(isl::space space) +boolean basic_set::is_subset(const isl::union_set &uset2) const { - auto res = isl_basic_set_universe(space.release()); - return manage(res); + return isl::set(*this).is_subset(uset2); } -isl::basic_map basic_set::unwrap() const +boolean basic_set::is_subset(const isl::point &bset2) const { - auto res = isl_basic_set_unwrap(copy()); - return manage(res); 
+ return this->is_subset(isl::basic_set(bset2)); } -isl::basic_set basic_set::upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const +boolean basic_set::is_wrapping() const { - auto res = isl_basic_set_upper_bound_val(copy(), static_cast(type), pos, value.release()); + auto res = isl_basic_set_is_wrapping(get()); return manage(res); } -// implementations for isl::basic_set_list -basic_set_list manage(__isl_take isl_basic_set_list *ptr) { - return basic_set_list(ptr); -} -basic_set_list manage_copy(__isl_keep isl_basic_set_list *ptr) { - ptr = isl_basic_set_list_copy(ptr); - return basic_set_list(ptr); +boolean basic_set::isa_set() const +{ + return isl::set(*this).isa_set(); } -basic_set_list::basic_set_list() - : ptr(nullptr) {} - -basic_set_list::basic_set_list(const basic_set_list &obj) - : ptr(nullptr) +isl::set basic_set::lexmax() const { - ptr = obj.copy(); + auto res = isl_basic_set_lexmax(copy()); + return manage(res); } +isl::pw_multi_aff basic_set::lexmax_pw_multi_aff() const +{ + return isl::set(*this).lexmax_pw_multi_aff(); +} -basic_set_list::basic_set_list(__isl_take isl_basic_set_list *ptr) - : ptr(ptr) {} - +isl::set basic_set::lexmin() const +{ + auto res = isl_basic_set_lexmin(copy()); + return manage(res); +} -basic_set_list &basic_set_list::operator=(basic_set_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::pw_multi_aff basic_set::lexmin_pw_multi_aff() const +{ + return isl::set(*this).lexmin_pw_multi_aff(); } -basic_set_list::~basic_set_list() { - if (ptr) - isl_basic_set_list_free(ptr); +isl::set basic_set::lower_bound(const isl::multi_pw_aff &lower) const +{ + return isl::set(*this).lower_bound(lower); } -__isl_give isl_basic_set_list *basic_set_list::copy() const & { - return isl_basic_set_list_copy(ptr); +isl::set basic_set::lower_bound(const isl::multi_val &lower) const +{ + return isl::set(*this).lower_bound(lower); } -__isl_keep isl_basic_set_list *basic_set_list::get() const { - return ptr; +isl::set 
basic_set::lower_bound_si(isl::dim type, unsigned int pos, int value) const +{ + return isl::set(*this).lower_bound_si(type, pos, value); } -__isl_give isl_basic_set_list *basic_set_list::release() { - isl_basic_set_list *tmp = ptr; - ptr = nullptr; - return tmp; +isl::set basic_set::lower_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const +{ + return isl::set(*this).lower_bound_val(type, pos, value); +} + +isl::set basic_set::lower_bound_val(isl::dim type, unsigned int pos, long value) const +{ + return this->lower_bound_val(type, pos, isl::val(ctx(), value)); } -bool basic_set_list::is_null() const { - return ptr == nullptr; +isl::multi_pw_aff basic_set::max_multi_pw_aff() const +{ + return isl::set(*this).max_multi_pw_aff(); } +isl::val basic_set::max_val(const isl::aff &obj) const +{ + return isl::set(*this).max_val(obj); +} -isl::ctx basic_set_list::ctx() const { - return isl::ctx(isl_basic_set_list_get_ctx(ptr)); +isl::multi_pw_aff basic_set::min_multi_pw_aff() const +{ + return isl::set(*this).min_multi_pw_aff(); } -void basic_set_list::dump() const { - isl_basic_set_list_dump(get()); +isl::val basic_set::min_val(const isl::aff &obj) const +{ + return isl::set(*this).min_val(obj); } +class size basic_set::n_basic_set() const +{ + return isl::set(*this).n_basic_set(); +} -isl::basic_set_list basic_set_list::add(isl::basic_set el) const +isl::basic_set basic_set::params() const { - auto res = isl_basic_set_list_add(copy(), el.release()); + auto res = isl_basic_set_params(copy()); return manage(res); } -isl::basic_set_list basic_set_list::alloc(isl::ctx ctx, int n) +isl::val basic_set::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const { - auto res = isl_basic_set_list_alloc(ctx.release(), n); - return manage(res); + return isl::set(*this).plain_get_val_if_fixed(type, pos); } -isl::basic_set_list basic_set_list::clear() const +isl::multi_val basic_set::plain_multi_val_if_fixed() const { - auto res = 
isl_basic_set_list_clear(copy()); - return manage(res); + return isl::set(*this).plain_multi_val_if_fixed(); } -isl::basic_set_list basic_set_list::coefficients() const +isl::basic_set basic_set::polyhedral_hull() const { - auto res = isl_basic_set_list_coefficients(copy()); - return manage(res); + return isl::set(*this).polyhedral_hull(); } -isl::basic_set_list basic_set_list::concat(isl::basic_set_list list2) const +isl::set basic_set::preimage(const isl::multi_aff &ma) const { - auto res = isl_basic_set_list_concat(copy(), list2.release()); - return manage(res); + return isl::set(*this).preimage(ma); } -isl::basic_set_list basic_set_list::drop(unsigned int first, unsigned int n) const +isl::set basic_set::preimage(const isl::multi_pw_aff &mpa) const { - auto res = isl_basic_set_list_drop(copy(), first, n); - return manage(res); + return isl::set(*this).preimage(mpa); } -stat basic_set_list::foreach(const std::function &fn) const +isl::set basic_set::preimage(const isl::pw_multi_aff &pma) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_basic_set *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_basic_set_list_foreach(get(), fn_lambda, &fn_data); - return manage(res); + return isl::set(*this).preimage(pma); } -isl::basic_set_list basic_set_list::from_basic_set(isl::basic_set el) +isl::union_set basic_set::preimage(const isl::union_pw_multi_aff &upma) const { - auto res = isl_basic_set_list_from_basic_set(el.release()); - return manage(res); + return isl::set(*this).preimage(upma); } -isl::basic_set basic_set_list::get_at(int index) const +isl::set basic_set::product(const isl::set &set2) const { - auto res = isl_basic_set_list_get_at(get(), index); - return manage(res); + return isl::set(*this).product(set2); } -isl::basic_set basic_set_list::get_basic_set(int index) const +isl::basic_set 
basic_set::project_out(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_set_list_get_basic_set(get(), index); + auto res = isl_basic_set_project_out(copy(), static_cast(type), first, n); return manage(res); } -isl::basic_set_list basic_set_list::insert(unsigned int pos, isl::basic_set el) const +isl::set basic_set::project_out_all_params() const { - auto res = isl_basic_set_list_insert(copy(), pos, el.release()); - return manage(res); + return isl::set(*this).project_out_all_params(); } -isl_size basic_set_list::n_basic_set() const +isl::set basic_set::project_out_param(const isl::id &id) const { - auto res = isl_basic_set_list_n_basic_set(get()); - return res; + return isl::set(*this).project_out_param(id); } -isl::basic_set_list basic_set_list::reverse() const +isl::set basic_set::project_out_param(const std::string &id) const { - auto res = isl_basic_set_list_reverse(copy()); - return manage(res); + return this->project_out_param(isl::id(ctx(), id)); } -isl::basic_set_list basic_set_list::set_basic_set(int index, isl::basic_set el) const +isl::set basic_set::project_out_param(const isl::id_list &list) const { - auto res = isl_basic_set_list_set_basic_set(copy(), index, el.release()); - return manage(res); + return isl::set(*this).project_out_param(list); } -isl_size basic_set_list::size() const +isl::pw_multi_aff basic_set::pw_multi_aff_on_domain(const isl::multi_val &mv) const { - auto res = isl_basic_set_list_size(get()); - return res; + return isl::set(*this).pw_multi_aff_on_domain(mv); } -isl::basic_set_list basic_set_list::swap(unsigned int pos1, unsigned int pos2) const +isl::set basic_set::remove_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_basic_set_list_swap(copy(), pos1, pos2); - return manage(res); + return isl::set(*this).remove_dims(type, first, n); } -// implementations for isl::constraint -constraint manage(__isl_take isl_constraint *ptr) { - return constraint(ptr); +isl::set 
basic_set::remove_divs() const +{ + return isl::set(*this).remove_divs(); } -constraint manage_copy(__isl_keep isl_constraint *ptr) { - ptr = isl_constraint_copy(ptr); - return constraint(ptr); + +isl::set basic_set::remove_redundancies() const +{ + return isl::set(*this).remove_redundancies(); } -constraint::constraint() - : ptr(nullptr) {} +isl::set basic_set::reset_tuple_id() const +{ + return isl::set(*this).reset_tuple_id(); +} -constraint::constraint(const constraint &obj) - : ptr(nullptr) +isl::basic_set basic_set::sample() const { - ptr = obj.copy(); + auto res = isl_basic_set_sample(copy()); + return manage(res); } +isl::point basic_set::sample_point() const +{ + auto res = isl_basic_set_sample_point(copy()); + return manage(res); +} -constraint::constraint(__isl_take isl_constraint *ptr) - : ptr(ptr) {} +isl::set basic_set::set_dim_id(isl::dim type, unsigned int pos, const isl::id &id) const +{ + return isl::set(*this).set_dim_id(type, pos, id); +} +isl::set basic_set::set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const +{ + return this->set_dim_id(type, pos, isl::id(ctx(), id)); +} -constraint &constraint::operator=(constraint obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::set_list basic_set::set_list() const +{ + return isl::set(*this).set_list(); } -constraint::~constraint() { - if (ptr) - isl_constraint_free(ptr); +isl::set basic_set::set_tuple_id(const isl::id &id) const +{ + return isl::set(*this).set_tuple_id(id); } -__isl_give isl_constraint *constraint::copy() const & { - return isl_constraint_copy(ptr); +isl::set basic_set::set_tuple_id(const std::string &id) const +{ + return this->set_tuple_id(isl::id(ctx(), id)); } -__isl_keep isl_constraint *constraint::get() const { - return ptr; +isl::fixed_box basic_set::simple_fixed_box_hull() const +{ + return isl::set(*this).simple_fixed_box_hull(); } -__isl_give isl_constraint *constraint::release() { - isl_constraint *tmp = ptr; - ptr = nullptr; - return tmp; 
+isl::basic_set basic_set::simple_hull() const +{ + return isl::set(*this).simple_hull(); } -bool constraint::is_null() const { - return ptr == nullptr; +isl::space basic_set::space() const +{ + auto res = isl_basic_set_get_space(get()); + return manage(res); } +isl::space basic_set::get_space() const +{ + return space(); +} -isl::ctx constraint::ctx() const { - return isl::ctx(isl_constraint_get_ctx(ptr)); +isl::val basic_set::stride(int pos) const +{ + return isl::set(*this).stride(pos); } -void constraint::dump() const { - isl_constraint_dump(get()); +isl::set basic_set::subtract(const isl::set &set2) const +{ + return isl::set(*this).subtract(set2); } +isl::union_set basic_set::subtract(const isl::union_set &uset2) const +{ + return isl::set(*this).subtract(uset2); +} -isl::constraint constraint::alloc_equality(isl::local_space ls) +isl::basic_set_list basic_set::to_list() const { - auto res = isl_constraint_alloc_equality(ls.release()); + auto res = isl_basic_set_to_list(copy()); return manage(res); } -isl::constraint constraint::alloc_inequality(isl::local_space ls) +isl::set basic_set::to_set() const { - auto res = isl_constraint_alloc_inequality(ls.release()); + auto res = isl_basic_set_to_set(copy()); return manage(res); } -int constraint::cmp_last_non_zero(const isl::constraint &c2) const +isl::union_set basic_set::to_union_set() const { - auto res = isl_constraint_cmp_last_non_zero(get(), c2.get()); - return res; + return isl::set(*this).to_union_set(); } -isl::aff constraint::get_aff() const +isl::map basic_set::translation() const { - auto res = isl_constraint_get_aff(get()); - return manage(res); + return isl::set(*this).translation(); } -isl::aff constraint::get_bound(isl::dim type, int pos) const +class size basic_set::tuple_dim() const { - auto res = isl_constraint_get_bound(get(), static_cast(type), pos); - return manage(res); + return isl::set(*this).tuple_dim(); } -isl::val constraint::get_coefficient_val(isl::dim type, int pos) const +isl::id 
basic_set::tuple_id() const { - auto res = isl_constraint_get_coefficient_val(get(), static_cast(type), pos); - return manage(res); + return isl::set(*this).tuple_id(); } -isl::val constraint::get_constant_val() const +std::string basic_set::tuple_name() const { - auto res = isl_constraint_get_constant_val(get()); - return manage(res); + return isl::set(*this).tuple_name(); } -std::string constraint::get_dim_name(isl::dim type, unsigned int pos) const +isl::set basic_set::unbind_params(const isl::multi_id &tuple) const { - auto res = isl_constraint_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + return isl::set(*this).unbind_params(tuple); } -isl::aff constraint::get_div(int pos) const +isl::map basic_set::unbind_params_insert_domain(const isl::multi_id &domain) const { - auto res = isl_constraint_get_div(get(), pos); - return manage(res); + return isl::set(*this).unbind_params_insert_domain(domain); } -isl::local_space constraint::get_local_space() const +isl::set basic_set::unite(isl::basic_set bset2) const { - auto res = isl_constraint_get_local_space(get()); + auto res = isl_basic_set_union(copy(), bset2.release()); return manage(res); } -isl::space constraint::get_space() const +isl::set basic_set::unite(const isl::set &set2) const { - auto res = isl_constraint_get_space(get()); - return manage(res); + return isl::set(*this).unite(set2); } -boolean constraint::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_set basic_set::unite(const isl::union_set &uset2) const { - auto res = isl_constraint_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return isl::set(*this).unite(uset2); } -boolean constraint::is_div_constraint() const +isl::set basic_set::unite(const isl::point &bset2) const { - auto res = isl_constraint_is_div_constraint(get()); - return manage(res); + return this->unite(isl::basic_set(bset2)); } -boolean constraint::is_lower_bound(isl::dim type, unsigned 
int pos) const +isl::basic_set basic_set::universe(isl::space space) { - auto res = isl_constraint_is_lower_bound(get(), static_cast(type), pos); + auto res = isl_basic_set_universe(space.release()); return manage(res); } -boolean constraint::is_upper_bound(isl::dim type, unsigned int pos) const +isl::basic_set basic_set::unshifted_simple_hull() const { - auto res = isl_constraint_is_upper_bound(get(), static_cast(type), pos); - return manage(res); + return isl::set(*this).unshifted_simple_hull(); } -int constraint::plain_cmp(const isl::constraint &c2) const +isl::map basic_set::unwrap() const { - auto res = isl_constraint_plain_cmp(get(), c2.get()); - return res; + return isl::set(*this).unwrap(); } -isl::constraint constraint::set_coefficient_si(isl::dim type, int pos, int v) const +isl::set basic_set::upper_bound(const isl::multi_pw_aff &upper) const { - auto res = isl_constraint_set_coefficient_si(copy(), static_cast(type), pos, v); - return manage(res); + return isl::set(*this).upper_bound(upper); } -isl::constraint constraint::set_coefficient_val(isl::dim type, int pos, isl::val v) const +isl::set basic_set::upper_bound(const isl::multi_val &upper) const { - auto res = isl_constraint_set_coefficient_val(copy(), static_cast(type), pos, v.release()); - return manage(res); + return isl::set(*this).upper_bound(upper); } -isl::constraint constraint::set_constant_si(int v) const +isl::set basic_set::upper_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const { - auto res = isl_constraint_set_constant_si(copy(), v); - return manage(res); + return isl::set(*this).upper_bound_val(type, pos, value); } -isl::constraint constraint::set_constant_val(isl::val v) const +isl::set basic_set::upper_bound_val(isl::dim type, unsigned int pos, long value) const { - auto res = isl_constraint_set_constant_val(copy(), v.release()); - return manage(res); + return this->upper_bound_val(type, pos, isl::val(ctx(), value)); +} + +inline std::ostream 
&operator<<(std::ostream &os, const basic_set &obj) +{ + char *str = isl_basic_set_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::constraint_list -constraint_list manage(__isl_take isl_constraint_list *ptr) { - return constraint_list(ptr); +// implementations for isl::basic_set_list +basic_set_list manage(__isl_take isl_basic_set_list *ptr) { + return basic_set_list(ptr); } -constraint_list manage_copy(__isl_keep isl_constraint_list *ptr) { - ptr = isl_constraint_list_copy(ptr); - return constraint_list(ptr); +basic_set_list manage_copy(__isl_keep isl_basic_set_list *ptr) { + ptr = isl_basic_set_list_copy(ptr); + return basic_set_list(ptr); } -constraint_list::constraint_list() +basic_set_list::basic_set_list() : ptr(nullptr) {} -constraint_list::constraint_list(const constraint_list &obj) +basic_set_list::basic_set_list(const basic_set_list &obj) : ptr(nullptr) { ptr = obj.copy(); } - -constraint_list::constraint_list(__isl_take isl_constraint_list *ptr) +basic_set_list::basic_set_list(__isl_take isl_basic_set_list *ptr) : ptr(ptr) {} +basic_set_list::basic_set_list(isl::ctx ctx, int n) +{ + auto res = isl_basic_set_list_alloc(ctx.release(), n); + ptr = res; +} + +basic_set_list::basic_set_list(isl::basic_set el) +{ + auto res = isl_basic_set_list_from_basic_set(el.release()); + ptr = res; +} -constraint_list &constraint_list::operator=(constraint_list obj) { +basic_set_list &basic_set_list::operator=(basic_set_list obj) { std::swap(this->ptr, obj.ptr); return *this; } -constraint_list::~constraint_list() { +basic_set_list::~basic_set_list() { if (ptr) - isl_constraint_list_free(ptr); + isl_basic_set_list_free(ptr); } -__isl_give isl_constraint_list *constraint_list::copy() const & { - return isl_constraint_list_copy(ptr); +__isl_give isl_basic_set_list *basic_set_list::copy() const & { + return isl_basic_set_list_copy(ptr); } -__isl_keep 
isl_constraint_list *constraint_list::get() const { +__isl_keep isl_basic_set_list *basic_set_list::get() const { return ptr; } -__isl_give isl_constraint_list *constraint_list::release() { - isl_constraint_list *tmp = ptr; +__isl_give isl_basic_set_list *basic_set_list::release() { + isl_basic_set_list *tmp = ptr; ptr = nullptr; return tmp; } -bool constraint_list::is_null() const { +bool basic_set_list::is_null() const { return ptr == nullptr; } - -isl::ctx constraint_list::ctx() const { - return isl::ctx(isl_constraint_list_get_ctx(ptr)); +isl::ctx basic_set_list::ctx() const { + return isl::ctx(isl_basic_set_list_get_ctx(ptr)); } -void constraint_list::dump() const { - isl_constraint_list_dump(get()); +isl::basic_set_list basic_set_list::add(isl::basic_set el) const +{ + auto res = isl_basic_set_list_add(copy(), el.release()); + return manage(res); } - -isl::constraint_list constraint_list::add(isl::constraint el) const +isl::basic_set basic_set_list::at(int index) const { - auto res = isl_constraint_list_add(copy(), el.release()); + auto res = isl_basic_set_list_get_at(get(), index); return manage(res); } -isl::constraint_list constraint_list::alloc(isl::ctx ctx, int n) +isl::basic_set basic_set_list::get_at(int index) const { - auto res = isl_constraint_list_alloc(ctx.release(), n); - return manage(res); + return at(index); } -isl::constraint_list constraint_list::clear() const +isl::basic_set_list basic_set_list::clear() const { - auto res = isl_constraint_list_clear(copy()); + auto res = isl_basic_set_list_clear(copy()); return manage(res); } -isl::constraint_list constraint_list::concat(isl::constraint_list list2) const +isl::basic_set_list basic_set_list::concat(isl::basic_set_list list2) const { - auto res = isl_constraint_list_concat(copy(), list2.release()); + auto res = isl_basic_set_list_concat(copy(), list2.release()); return manage(res); } -isl::constraint_list constraint_list::drop(unsigned int first, unsigned int n) const +isl::basic_set_list 
basic_set_list::drop(unsigned int first, unsigned int n) const { - auto res = isl_constraint_list_drop(copy(), first, n); + auto res = isl_basic_set_list_drop(copy(), first, n); return manage(res); } -stat constraint_list::foreach(const std::function &fn) const +stat basic_set_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_constraint *arg_0, void *arg_1) -> isl_stat { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_basic_set *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; - auto res = isl_constraint_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_basic_set_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::constraint_list constraint_list::from_constraint(isl::constraint el) +isl::basic_set_list basic_set_list::insert(unsigned int pos, isl::basic_set el) const { - auto res = isl_constraint_list_from_constraint(el.release()); + auto res = isl_basic_set_list_insert(copy(), pos, el.release()); return manage(res); } -isl::constraint constraint_list::get_at(int index) const +class size basic_set_list::size() const { - auto res = isl_constraint_list_get_at(get(), index); + auto res = isl_basic_set_list_size(get()); return manage(res); } -isl::constraint constraint_list::get_constraint(int index) const +inline std::ostream &operator<<(std::ostream &os, const basic_set_list &obj) { - auto res = isl_constraint_list_get_constraint(get(), index); - return manage(res); + char *str = isl_basic_set_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + +// implementations for isl::constraint +constraint manage(__isl_take isl_constraint *ptr) { + return constraint(ptr); +} +constraint manage_copy(__isl_keep isl_constraint *ptr) 
{ + ptr = isl_constraint_copy(ptr); + return constraint(ptr); +} + +constraint::constraint() + : ptr(nullptr) {} + +constraint::constraint(const constraint &obj) + : ptr(nullptr) +{ + ptr = obj.copy(); +} + +constraint::constraint(__isl_take isl_constraint *ptr) + : ptr(ptr) {} + +constraint &constraint::operator=(constraint obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +constraint::~constraint() { + if (ptr) + isl_constraint_free(ptr); +} + +__isl_give isl_constraint *constraint::copy() const & { + return isl_constraint_copy(ptr); +} + +__isl_keep isl_constraint *constraint::get() const { + return ptr; +} + +__isl_give isl_constraint *constraint::release() { + isl_constraint *tmp = ptr; + ptr = nullptr; + return tmp; +} + +bool constraint::is_null() const { + return ptr == nullptr; +} + +isl::ctx constraint::ctx() const { + return isl::ctx(isl_constraint_get_ctx(ptr)); } -isl::constraint_list constraint_list::insert(unsigned int pos, isl::constraint el) const +isl::constraint constraint::alloc_equality(isl::local_space ls) { - auto res = isl_constraint_list_insert(copy(), pos, el.release()); + auto res = isl_constraint_alloc_equality(ls.release()); return manage(res); } -isl_size constraint_list::n_constraint() const +isl::constraint constraint::alloc_inequality(isl::local_space ls) { - auto res = isl_constraint_list_n_constraint(get()); - return res; + auto res = isl_constraint_alloc_inequality(ls.release()); + return manage(res); } -isl::constraint_list constraint_list::reverse() const +isl::constraint constraint::set_coefficient_si(isl::dim type, int pos, int v) const { - auto res = isl_constraint_list_reverse(copy()); + auto res = isl_constraint_set_coefficient_si(copy(), static_cast(type), pos, v); return manage(res); } -isl::constraint_list constraint_list::set_constraint(int index, isl::constraint el) const +isl::constraint constraint::set_constant_si(int v) const { - auto res = isl_constraint_list_set_constraint(copy(), index, el.release()); 
+ auto res = isl_constraint_set_constant_si(copy(), v); return manage(res); } -isl_size constraint_list::size() const +isl::constraint constraint::set_constant_val(isl::val v) const { - auto res = isl_constraint_list_size(get()); - return res; + auto res = isl_constraint_set_constant_val(copy(), v.release()); + return manage(res); } -isl::constraint_list constraint_list::swap(unsigned int pos1, unsigned int pos2) const +isl::constraint constraint::set_constant_val(long v) const { - auto res = isl_constraint_list_swap(copy(), pos1, pos2); - return manage(res); + return this->set_constant_val(isl::val(ctx(), v)); } // implementations for isl::fixed_box @@ -7462,11 +10089,9 @@ fixed_box::fixed_box(const fixed_box &obj) ptr = obj.copy(); } - fixed_box::fixed_box(__isl_take isl_fixed_box *ptr) : ptr(ptr) {} - fixed_box &fixed_box::operator=(fixed_box obj) { std::swap(this->ptr, obj.ptr); return *this; @@ -7495,38 +10120,59 @@ bool fixed_box::is_null() const { return ptr == nullptr; } - isl::ctx fixed_box::ctx() const { return isl::ctx(isl_fixed_box_get_ctx(ptr)); } -void fixed_box::dump() const { - isl_fixed_box_dump(get()); +boolean fixed_box::is_valid() const +{ + auto res = isl_fixed_box_is_valid(get()); + return manage(res); } - -isl::multi_aff fixed_box::get_offset() const +isl::multi_aff fixed_box::offset() const { auto res = isl_fixed_box_get_offset(get()); return manage(res); } -isl::multi_val fixed_box::get_size() const +isl::multi_aff fixed_box::get_offset() const +{ + return offset(); +} + +isl::multi_val fixed_box::size() const { auto res = isl_fixed_box_get_size(get()); return manage(res); } -isl::space fixed_box::get_space() const +isl::multi_val fixed_box::get_size() const +{ + return size(); +} + +isl::space fixed_box::space() const { auto res = isl_fixed_box_get_space(get()); return manage(res); } -boolean fixed_box::is_valid() const +isl::space fixed_box::get_space() const { - auto res = isl_fixed_box_is_valid(get()); - return manage(res); + return 
space(); +} + +inline std::ostream &operator<<(std::ostream &os, const fixed_box &obj) +{ + char *str = isl_fixed_box_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::id @@ -7547,7 +10193,6 @@ id::id(const id &obj) ptr = obj.copy(); } - id::id(__isl_take isl_id *ptr) : ptr(ptr) {} @@ -7585,41 +10230,57 @@ bool id::is_null() const { return ptr == nullptr; } - isl::ctx id::ctx() const { return isl::ctx(isl_id_get_ctx(ptr)); } -void id::dump() const { - isl_id_dump(get()); -} - - isl::id id::alloc(isl::ctx ctx, const std::string &name, void * user) { auto res = isl_id_alloc(ctx.release(), name.c_str(), user); return manage(res); } -uint32_t id::get_hash() const +std::string id::name() const { - auto res = isl_id_get_hash(get()); - return res; + auto res = isl_id_get_name(get()); + std::string tmp(res); + return tmp; } std::string id::get_name() const { - auto res = isl_id_get_name(get()); - std::string tmp(res); - return tmp; + return name(); } -void * id::get_user() const +isl::id_list id::to_list() const +{ + auto res = isl_id_to_list(copy()); + return manage(res); +} + +void * id::user() const { auto res = isl_id_get_user(get()); return res; } +void * id::get_user() const +{ + return user(); +} + +inline std::ostream &operator<<(std::ostream &os, const id &obj) +{ + char *str = isl_id_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // implementations for isl::id_list id_list manage(__isl_take isl_id_list *ptr) { return id_list(ptr); @@ -7638,10 +10299,26 @@ id_list::id_list(const id_list &obj) ptr = obj.copy(); } - id_list::id_list(__isl_take isl_id_list *ptr) : ptr(ptr) {} +id_list::id_list(isl::ctx ctx, int n) +{ + auto res = isl_id_list_alloc(ctx.release(), n); + ptr = res; +} + +id_list::id_list(isl::id el) +{ + auto res = isl_id_list_from_id(el.release()); + ptr = res; +} 
+ +id_list::id_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_id_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} id_list &id_list::operator=(id_list obj) { std::swap(this->ptr, obj.ptr); @@ -7671,28 +10348,32 @@ bool id_list::is_null() const { return ptr == nullptr; } - isl::ctx id_list::ctx() const { return isl::ctx(isl_id_list_get_ctx(ptr)); } -void id_list::dump() const { - isl_id_list_dump(get()); -} - - isl::id_list id_list::add(isl::id el) const { auto res = isl_id_list_add(copy(), el.release()); return manage(res); } -isl::id_list id_list::alloc(isl::ctx ctx, int n) +isl::id_list id_list::add(const std::string &el) const { - auto res = isl_id_list_alloc(ctx.release(), n); + return this->add(isl::id(ctx(), el)); +} + +isl::id id_list::at(int index) const +{ + auto res = isl_id_list_get_at(get(), index); return manage(res); } +isl::id id_list::get_at(int index) const +{ + return at(index); +} + isl::id_list id_list::clear() const { auto res = isl_id_list_clear(copy()); @@ -7711,72 +10392,47 @@ isl::id_list id_list::drop(unsigned int first, unsigned int n) const return manage(res); } -stat id_list::foreach(const std::function &fn) const +stat id_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_id *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_id_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::id_list id_list::from_id(isl::id el) -{ - auto res = isl_id_list_from_id(el.release()); - return manage(res); -} - -isl::id id_list::get_at(int index) const -{ - auto res = isl_id_list_get_at(get(), index); - return manage(res); -} - -isl::id id_list::get_id(int index) const -{ - auto res = isl_id_list_get_id(get(), index); - return 
manage(res); -} - isl::id_list id_list::insert(unsigned int pos, isl::id el) const { auto res = isl_id_list_insert(copy(), pos, el.release()); return manage(res); } -isl_size id_list::n_id() const -{ - auto res = isl_id_list_n_id(get()); - return res; -} - -isl::id_list id_list::reverse() const -{ - auto res = isl_id_list_reverse(copy()); - return manage(res); -} - -isl::id_list id_list::set_id(int index, isl::id el) const +isl::id_list id_list::insert(unsigned int pos, const std::string &el) const { - auto res = isl_id_list_set_id(copy(), index, el.release()); - return manage(res); + return this->insert(pos, isl::id(ctx(), el)); } -isl_size id_list::size() const +class size id_list::size() const { auto res = isl_id_list_size(get()); - return res; + return manage(res); } -isl::id_list id_list::swap(unsigned int pos1, unsigned int pos2) const +inline std::ostream &operator<<(std::ostream &os, const id_list &obj) { - auto res = isl_id_list_swap(copy(), pos1, pos2); - return manage(res); + char *str = isl_id_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::id_to_ast_expr @@ -7797,11 +10453,9 @@ id_to_ast_expr::id_to_ast_expr(const id_to_ast_expr &obj) ptr = obj.copy(); } - id_to_ast_expr::id_to_ast_expr(__isl_take isl_id_to_ast_expr *ptr) : ptr(ptr) {} - id_to_ast_expr &id_to_ast_expr::operator=(id_to_ast_expr obj) { std::swap(this->ptr, obj.ptr); return *this; @@ -7830,58 +10484,25 @@ bool id_to_ast_expr::is_null() const { return ptr == nullptr; } - isl::ctx id_to_ast_expr::ctx() const { return isl::ctx(isl_id_to_ast_expr_get_ctx(ptr)); } -void id_to_ast_expr::dump() const { - isl_id_to_ast_expr_dump(get()); -} - - isl::id_to_ast_expr id_to_ast_expr::alloc(isl::ctx ctx, int min_size) { auto res = isl_id_to_ast_expr_alloc(ctx.release(), min_size); return manage(res); } -isl::id_to_ast_expr id_to_ast_expr::drop(isl::id key) const -{ - auto res = 
isl_id_to_ast_expr_drop(copy(), key.release()); - return manage(res); -} - -stat id_to_ast_expr::foreach(const std::function &fn) const -{ - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_id *arg_0, isl_ast_expr *arg_1, void *arg_2) -> isl_stat { - auto *data = static_cast(arg_2); - stat ret = (*data->func)(manage(arg_0), manage(arg_1)); - return ret.release(); - }; - auto res = isl_id_to_ast_expr_foreach(get(), fn_lambda, &fn_data); - return manage(res); -} - -isl::ast_expr id_to_ast_expr::get(isl::id key) const -{ - auto res = isl_id_to_ast_expr_get(get(), key.release()); - return manage(res); -} - -boolean id_to_ast_expr::has(const isl::id &key) const +isl::id_to_ast_expr id_to_ast_expr::set(isl::id key, isl::ast_expr val) const { - auto res = isl_id_to_ast_expr_has(get(), key.get()); + auto res = isl_id_to_ast_expr_set(copy(), key.release(), val.release()); return manage(res); } -isl::id_to_ast_expr id_to_ast_expr::set(isl::id key, isl::ast_expr val) const +isl::id_to_ast_expr id_to_ast_expr::set(const std::string &key, const isl::ast_expr &val) const { - auto res = isl_id_to_ast_expr_set(copy(), key.release(), val.release()); - return manage(res); + return this->set(isl::id(ctx(), key), val); } // implementations for isl::local_space @@ -7902,7 +10523,6 @@ local_space::local_space(const local_space &obj) ptr = obj.copy(); } - local_space::local_space(__isl_take isl_local_space *ptr) : ptr(ptr) {} @@ -7917,182 +10537,31 @@ local_space &local_space::operator=(local_space obj) { return *this; } -local_space::~local_space() { - if (ptr) - isl_local_space_free(ptr); -} - -__isl_give isl_local_space *local_space::copy() const & { - return isl_local_space_copy(ptr); -} - -__isl_keep isl_local_space *local_space::get() const { - return ptr; -} - -__isl_give isl_local_space *local_space::release() { - isl_local_space *tmp = ptr; - ptr = nullptr; - return tmp; -} - -bool local_space::is_null() const { - return ptr == 
nullptr; -} - - -isl::ctx local_space::ctx() const { - return isl::ctx(isl_local_space_get_ctx(ptr)); -} - -void local_space::dump() const { - isl_local_space_dump(get()); -} - - -isl::local_space local_space::add_dims(isl::dim type, unsigned int n) const -{ - auto res = isl_local_space_add_dims(copy(), static_cast(type), n); - return manage(res); -} - -isl_size local_space::dim(isl::dim type) const -{ - auto res = isl_local_space_dim(get(), static_cast(type)); - return res; -} - -isl::local_space local_space::domain() const -{ - auto res = isl_local_space_domain(copy()); - return manage(res); -} - -isl::local_space local_space::drop_dims(isl::dim type, unsigned int first, unsigned int n) const -{ - auto res = isl_local_space_drop_dims(copy(), static_cast(type), first, n); - return manage(res); -} - -int local_space::find_dim_by_name(isl::dim type, const std::string &name) const -{ - auto res = isl_local_space_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; -} - -isl::local_space local_space::flatten_domain() const -{ - auto res = isl_local_space_flatten_domain(copy()); - return manage(res); -} - -isl::local_space local_space::flatten_range() const -{ - auto res = isl_local_space_flatten_range(copy()); - return manage(res); -} - -isl::local_space local_space::from_domain() const -{ - auto res = isl_local_space_from_domain(copy()); - return manage(res); -} - -isl::id local_space::get_dim_id(isl::dim type, unsigned int pos) const -{ - auto res = isl_local_space_get_dim_id(get(), static_cast(type), pos); - return manage(res); -} - -std::string local_space::get_dim_name(isl::dim type, unsigned int pos) const -{ - auto res = isl_local_space_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; -} - -isl::aff local_space::get_div(int pos) const -{ - auto res = isl_local_space_get_div(get(), pos); - return manage(res); -} - -isl::space local_space::get_space() const -{ - auto res = isl_local_space_get_space(get()); - 
return manage(res); -} - -boolean local_space::has_dim_id(isl::dim type, unsigned int pos) const -{ - auto res = isl_local_space_has_dim_id(get(), static_cast(type), pos); - return manage(res); -} - -boolean local_space::has_dim_name(isl::dim type, unsigned int pos) const -{ - auto res = isl_local_space_has_dim_name(get(), static_cast(type), pos); - return manage(res); -} - -isl::local_space local_space::insert_dims(isl::dim type, unsigned int first, unsigned int n) const -{ - auto res = isl_local_space_insert_dims(copy(), static_cast(type), first, n); - return manage(res); -} - -isl::local_space local_space::intersect(isl::local_space ls2) const -{ - auto res = isl_local_space_intersect(copy(), ls2.release()); - return manage(res); -} - -boolean local_space::is_equal(const isl::local_space &ls2) const -{ - auto res = isl_local_space_is_equal(get(), ls2.get()); - return manage(res); -} - -boolean local_space::is_params() const -{ - auto res = isl_local_space_is_params(get()); - return manage(res); -} - -boolean local_space::is_set() const -{ - auto res = isl_local_space_is_set(get()); - return manage(res); +local_space::~local_space() { + if (ptr) + isl_local_space_free(ptr); } -isl::local_space local_space::range() const -{ - auto res = isl_local_space_range(copy()); - return manage(res); +__isl_give isl_local_space *local_space::copy() const & { + return isl_local_space_copy(ptr); } -isl::local_space local_space::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const -{ - auto res = isl_local_space_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); +__isl_keep isl_local_space *local_space::get() const { + return ptr; } -isl::local_space local_space::set_from_params() const -{ - auto res = isl_local_space_set_from_params(copy()); - return manage(res); +__isl_give isl_local_space *local_space::release() { + isl_local_space *tmp = ptr; + ptr = nullptr; + return tmp; } -isl::local_space local_space::set_tuple_id(isl::dim type, 
isl::id id) const -{ - auto res = isl_local_space_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); +bool local_space::is_null() const { + return ptr == nullptr; } -isl::local_space local_space::wrap() const -{ - auto res = isl_local_space_wrap(copy()); - return manage(res); +isl::ctx local_space::ctx() const { + return isl::ctx(isl_local_space_get_ctx(ptr)); } // implementations for isl::map @@ -8113,7 +10582,6 @@ map::map(const map &obj) ptr = obj.copy(); } - map::map(__isl_take isl_map *ptr) : ptr(ptr) {} @@ -8122,6 +10590,7 @@ map::map(isl::basic_map bmap) auto res = isl_map_from_basic_map(bmap.release()); ptr = res; } + map::map(isl::ctx ctx, const std::string &str) { auto res = isl_map_read_from_str(ctx.release(), str.c_str()); @@ -8156,16 +10625,10 @@ bool map::is_null() const { return ptr == nullptr; } - isl::ctx map::ctx() const { return isl::ctx(isl_map_get_ctx(ptr)); } -void map::dump() const { - isl_map_dump(get()); -} - - isl::map map::add_constraint(isl::constraint constraint) const { auto res = isl_map_add_constraint(copy(), constraint.release()); @@ -8196,45 +10659,79 @@ isl::map map::apply_domain(isl::map map2) const return manage(res); } +isl::union_map map::apply_domain(const isl::union_map &umap2) const +{ + return isl::union_map(*this).apply_domain(umap2); +} + +isl::map map::apply_domain(const isl::basic_map &map2) const +{ + return this->apply_domain(isl::map(map2)); +} + isl::map map::apply_range(isl::map map2) const { auto res = isl_map_apply_range(copy(), map2.release()); return manage(res); } -isl::set map::bind_domain(isl::multi_id tuple) const +isl::union_map map::apply_range(const isl::union_map &umap2) const { - auto res = isl_map_bind_domain(copy(), tuple.release()); - return manage(res); + return isl::union_map(*this).apply_range(umap2); } -isl::set map::bind_range(isl::multi_id tuple) const +isl::map map::apply_range(const isl::basic_map &map2) const { - auto res = isl_map_bind_range(copy(), 
tuple.release()); + return this->apply_range(isl::map(map2)); +} + +isl::map map::as_map() const +{ + return isl::union_map(*this).as_map(); +} + +isl::multi_union_pw_aff map::as_multi_union_pw_aff() const +{ + return isl::union_map(*this).as_multi_union_pw_aff(); +} + +isl::pw_multi_aff map::as_pw_multi_aff() const +{ + auto res = isl_map_as_pw_multi_aff(copy()); return manage(res); } -boolean map::can_curry() const +isl::union_pw_multi_aff map::as_union_pw_multi_aff() const { - auto res = isl_map_can_curry(get()); + return isl::union_map(*this).as_union_pw_multi_aff(); +} + +isl::basic_map_list map::basic_map_list() const +{ + auto res = isl_map_get_basic_map_list(get()); return manage(res); } -boolean map::can_range_curry() const +isl::basic_map_list map::get_basic_map_list() const +{ + return basic_map_list(); +} + +isl::set map::bind_domain(isl::multi_id tuple) const { - auto res = isl_map_can_range_curry(get()); + auto res = isl_map_bind_domain(copy(), tuple.release()); return manage(res); } -boolean map::can_uncurry() const +isl::set map::bind_range(isl::multi_id tuple) const { - auto res = isl_map_can_uncurry(get()); + auto res = isl_map_bind_range(copy(), tuple.release()); return manage(res); } -boolean map::can_zip() const +boolean map::can_curry() const { - auto res = isl_map_can_zip(get()); + auto res = isl_map_can_curry(get()); return manage(res); } @@ -8250,10 +10747,9 @@ isl::map map::complement() const return manage(res); } -isl::basic_map map::convex_hull() const +isl::union_map map::compute_divs() const { - auto res = isl_map_convex_hull(copy()); - return manage(res); + return isl::union_map(*this).compute_divs(); } isl::map map::curry() const @@ -8268,22 +10764,16 @@ isl::set map::deltas() const return manage(res); } -isl::map map::deltas_map() const -{ - auto res = isl_map_deltas_map(copy()); - return manage(res); -} - isl::map map::detect_equalities() const { auto res = isl_map_detect_equalities(copy()); return manage(res); } -isl_size 
map::dim(isl::dim type) const +class size map::dim(isl::dim type) const { auto res = isl_map_dim(get(), static_cast(type)); - return res; + return manage(res); } isl::pw_aff map::dim_max(int pos) const @@ -8316,16 +10806,15 @@ isl::map map::domain_factor_range() const return manage(res); } -boolean map::domain_is_wrapping() const +isl::map map::domain_map() const { - auto res = isl_map_domain_is_wrapping(get()); + auto res = isl_map_domain_map(copy()); return manage(res); } -isl::map map::domain_map() const +isl::union_pw_multi_aff map::domain_map_union_pw_multi_aff() const { - auto res = isl_map_domain_map(copy()); - return manage(res); + return isl::union_map(*this).domain_map_union_pw_multi_aff(); } isl::map map::domain_product(isl::map map2) const @@ -8334,34 +10823,31 @@ isl::map map::domain_product(isl::map map2) const return manage(res); } -isl_size map::domain_tuple_dim() const +isl::union_map map::domain_product(const isl::union_map &umap2) const { - auto res = isl_map_domain_tuple_dim(get()); - return res; + return isl::union_map(*this).domain_product(umap2); } -isl::map map::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::map map::domain_product(const isl::basic_map &map2) const { - auto res = isl_map_drop_constraints_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return this->domain_product(isl::map(map2)); } -isl::map map::drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +class size map::domain_tuple_dim() const { - auto res = isl_map_drop_constraints_not_involving_dims(copy(), static_cast(type), first, n); + auto res = isl_map_domain_tuple_dim(get()); return manage(res); } -isl::map map::drop_unused_params() const +isl::id map::domain_tuple_id() const { - auto res = isl_map_drop_unused_params(copy()); + auto res = isl_map_get_domain_tuple_id(get()); return manage(res); } -isl::map map::eliminate(isl::dim type, unsigned int first, 
unsigned int n) const +isl::id map::get_domain_tuple_id() const { - auto res = isl_map_eliminate(copy(), static_cast(type), first, n); - return manage(res); + return domain_tuple_id(); } isl::map map::empty(isl::space space) @@ -8376,72 +10862,91 @@ isl::map map::eq_at(isl::multi_pw_aff mpa) const return manage(res); } -isl::map map::equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::union_map map::eq_at(const isl::multi_union_pw_aff &mupa) const { - auto res = isl_map_equate(copy(), static_cast(type1), pos1, static_cast(type2), pos2); - return manage(res); + return isl::union_map(*this).eq_at(mupa); } -isl::map map::factor_domain() const +isl::map map::eq_at(const isl::aff &mpa) const { - auto res = isl_map_factor_domain(copy()); - return manage(res); + return this->eq_at(isl::multi_pw_aff(mpa)); } -isl::map map::factor_range() const +isl::map map::eq_at(const isl::multi_aff &mpa) const { - auto res = isl_map_factor_range(copy()); - return manage(res); + return this->eq_at(isl::multi_pw_aff(mpa)); } -int map::find_dim_by_id(isl::dim type, const isl::id &id) const +isl::map map::eq_at(const isl::pw_aff &mpa) const { - auto res = isl_map_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + return this->eq_at(isl::multi_pw_aff(mpa)); } -int map::find_dim_by_name(isl::dim type, const std::string &name) const +isl::map map::eq_at(const isl::pw_multi_aff &mpa) const { - auto res = isl_map_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return this->eq_at(isl::multi_pw_aff(mpa)); } -isl::map map::fix_si(isl::dim type, unsigned int pos, int value) const +isl::map map::equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const { - auto res = isl_map_fix_si(copy(), static_cast(type), pos, value); + auto res = isl_map_equate(copy(), static_cast(type1), pos1, static_cast(type2), pos2); return manage(res); } -isl::map map::fix_val(isl::dim type, unsigned int pos, isl::val v) const +boolean map::every_map(const 
std::function &test) const { - auto res = isl_map_fix_val(copy(), static_cast(type), pos, v.release()); - return manage(res); + return isl::union_map(*this).every_map(test); } -isl::map map::fixed_power_val(isl::val exp) const +isl::map map::extract_map(const isl::space &space) const +{ + return isl::union_map(*this).extract_map(space); +} + +isl::map map::factor_domain() const { - auto res = isl_map_fixed_power_val(copy(), exp.release()); + auto res = isl_map_factor_domain(copy()); return manage(res); } -isl::map map::flat_domain_product(isl::map map2) const +isl::map map::factor_range() const { - auto res = isl_map_flat_domain_product(copy(), map2.release()); + auto res = isl_map_factor_range(copy()); return manage(res); } -isl::map map::flat_product(isl::map map2) const +isl::map map::fix_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_map_flat_product(copy(), map2.release()); + auto res = isl_map_fix_si(copy(), static_cast(type), pos, value); return manage(res); } +isl::union_map map::fixed_power(const isl::val &exp) const +{ + return isl::union_map(*this).fixed_power(exp); +} + +isl::union_map map::fixed_power(long exp) const +{ + return this->fixed_power(isl::val(ctx(), exp)); +} + isl::map map::flat_range_product(isl::map map2) const { auto res = isl_map_flat_range_product(copy(), map2.release()); return manage(res); } +isl::union_map map::flat_range_product(const isl::union_map &umap2) const +{ + return isl::union_map(*this).flat_range_product(umap2); +} + +isl::map map::flat_range_product(const isl::basic_map &map2) const +{ + return this->flat_range_product(isl::map(map2)); +} + isl::map map::flatten() const { auto res = isl_map_flatten(copy()); @@ -8466,20 +10971,30 @@ isl::map map::floordiv_val(isl::val d) const return manage(res); } -stat map::foreach_basic_map(const std::function &fn) const +isl::map map::floordiv_val(long d) const +{ + return this->floordiv_val(isl::val(ctx(), d)); +} + +stat map::foreach_basic_map(const 
std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_basic_map *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_map_foreach_basic_map(get(), fn_lambda, &fn_data); return manage(res); } +stat map::foreach_map(const std::function &fn) const +{ + return isl::union_map(*this).foreach_map(fn); +} + isl::map map::from_aff(isl::aff aff) { auto res = isl_map_from_aff(aff.release()); @@ -8504,24 +11019,12 @@ isl::map map::from_multi_aff(isl::multi_aff maff) return manage(res); } -isl::map map::from_multi_pw_aff(isl::multi_pw_aff mpa) -{ - auto res = isl_map_from_multi_pw_aff(mpa.release()); - return manage(res); -} - isl::map map::from_pw_aff(isl::pw_aff pwaff) { auto res = isl_map_from_pw_aff(pwaff.release()); return manage(res); } -isl::map map::from_pw_multi_aff(isl::pw_multi_aff pma) -{ - auto res = isl_map_from_pw_multi_aff(pma.release()); - return manage(res); -} - isl::map map::from_range(isl::set set) { auto res = isl_map_from_range(set.release()); @@ -8534,138 +11037,130 @@ isl::map map::from_union_map(isl::union_map umap) return manage(res); } -isl::basic_map_list map::get_basic_map_list() const +isl::map map::gist(isl::map context) const { - auto res = isl_map_get_basic_map_list(get()); + auto res = isl_map_gist(copy(), context.release()); return manage(res); } -isl::id map::get_dim_id(isl::dim type, unsigned int pos) const +isl::union_map map::gist(const isl::union_map &context) const { - auto res = isl_map_get_dim_id(get(), static_cast(type), pos); - return manage(res); + return isl::union_map(*this).gist(context); } -std::string map::get_dim_name(isl::dim type, unsigned int pos) const +isl::map map::gist(const isl::basic_map &context) const { - auto res = isl_map_get_dim_name(get(), static_cast(type), pos); - 
std::string tmp(res); - return tmp; + return this->gist(isl::map(context)); } -uint32_t map::get_hash() const +isl::map map::gist_domain(isl::set context) const { - auto res = isl_map_get_hash(get()); - return res; + auto res = isl_map_gist_domain(copy(), context.release()); + return manage(res); } -isl::fixed_box map::get_range_simple_fixed_box_hull() const +isl::union_map map::gist_domain(const isl::union_set &uset) const { - auto res = isl_map_get_range_simple_fixed_box_hull(get()); - return manage(res); + return isl::union_map(*this).gist_domain(uset); } -isl::space map::get_space() const +isl::map map::gist_domain(const isl::basic_set &context) const { - auto res = isl_map_get_space(get()); - return manage(res); + return this->gist_domain(isl::set(context)); } -isl::id map::get_tuple_id(isl::dim type) const +isl::map map::gist_domain(const isl::point &context) const { - auto res = isl_map_get_tuple_id(get(), static_cast(type)); + return this->gist_domain(isl::set(context)); +} + +isl::map map::gist_params(isl::set context) const +{ + auto res = isl_map_gist_params(copy(), context.release()); return manage(res); } -std::string map::get_tuple_name(isl::dim type) const +isl::union_map map::gist_range(const isl::union_set &uset) const { - auto res = isl_map_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + return isl::union_map(*this).gist_range(uset); } -isl::map map::gist(isl::map context) const +boolean map::has_domain_tuple_id() const { - auto res = isl_map_gist(copy(), context.release()); + auto res = isl_map_has_domain_tuple_id(get()); return manage(res); } -isl::map map::gist_basic_map(isl::basic_map context) const +boolean map::has_equal_space(const isl::map &map2) const { - auto res = isl_map_gist_basic_map(copy(), context.release()); + auto res = isl_map_has_equal_space(get(), map2.get()); return manage(res); } -isl::map map::gist_domain(isl::set context) const +boolean map::has_range_tuple_id() const { - auto res = 
isl_map_gist_domain(copy(), context.release()); + auto res = isl_map_has_range_tuple_id(get()); return manage(res); } -isl::map map::gist_params(isl::set context) const +boolean map::has_tuple_id(isl::dim type) const { - auto res = isl_map_gist_params(copy(), context.release()); + auto res = isl_map_has_tuple_id(get(), static_cast(type)); return manage(res); } -isl::map map::gist_range(isl::set context) const +boolean map::has_tuple_name(isl::dim type) const { - auto res = isl_map_gist_range(copy(), context.release()); + auto res = isl_map_has_tuple_name(get(), static_cast(type)); return manage(res); } -boolean map::has_dim_id(isl::dim type, unsigned int pos) const +isl::map map::identity(isl::space space) { - auto res = isl_map_has_dim_id(get(), static_cast(type), pos); + auto res = isl_map_identity(space.release()); return manage(res); } -boolean map::has_dim_name(isl::dim type, unsigned int pos) const +isl::map map::intersect(isl::map map2) const { - auto res = isl_map_has_dim_name(get(), static_cast(type), pos); + auto res = isl_map_intersect(copy(), map2.release()); return manage(res); } -boolean map::has_equal_space(const isl::map &map2) const +isl::union_map map::intersect(const isl::union_map &umap2) const { - auto res = isl_map_has_equal_space(get(), map2.get()); - return manage(res); + return isl::union_map(*this).intersect(umap2); } -boolean map::has_tuple_id(isl::dim type) const +isl::map map::intersect(const isl::basic_map &map2) const { - auto res = isl_map_has_tuple_id(get(), static_cast(type)); - return manage(res); + return this->intersect(isl::map(map2)); } -boolean map::has_tuple_name(isl::dim type) const +isl::map map::intersect_domain(isl::set set) const { - auto res = isl_map_has_tuple_name(get(), static_cast(type)); + auto res = isl_map_intersect_domain(copy(), set.release()); return manage(res); } -isl::map map::identity(isl::space space) +isl::union_map map::intersect_domain(const isl::space &space) const { - auto res = 
isl_map_identity(space.release()); - return manage(res); + return isl::union_map(*this).intersect_domain(space); } -isl::map map::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const +isl::union_map map::intersect_domain(const isl::union_set &uset) const { - auto res = isl_map_insert_dims(copy(), static_cast(type), pos, n); - return manage(res); + return isl::union_map(*this).intersect_domain(uset); } -isl::map map::intersect(isl::map map2) const +isl::map map::intersect_domain(const isl::basic_set &set) const { - auto res = isl_map_intersect(copy(), map2.release()); - return manage(res); + return this->intersect_domain(isl::set(set)); } -isl::map map::intersect_domain(isl::set set) const +isl::map map::intersect_domain(const isl::point &set) const { - auto res = isl_map_intersect_domain(copy(), set.release()); - return manage(res); + return this->intersect_domain(isl::set(set)); } isl::map map::intersect_domain_factor_domain(isl::map factor) const @@ -8674,12 +11169,32 @@ isl::map map::intersect_domain_factor_domain(isl::map factor) const return manage(res); } +isl::union_map map::intersect_domain_factor_domain(const isl::union_map &factor) const +{ + return isl::union_map(*this).intersect_domain_factor_domain(factor); +} + +isl::map map::intersect_domain_factor_domain(const isl::basic_map &factor) const +{ + return this->intersect_domain_factor_domain(isl::map(factor)); +} + isl::map map::intersect_domain_factor_range(isl::map factor) const { auto res = isl_map_intersect_domain_factor_range(copy(), factor.release()); return manage(res); } +isl::union_map map::intersect_domain_factor_range(const isl::union_map &factor) const +{ + return isl::union_map(*this).intersect_domain_factor_range(factor); +} + +isl::map map::intersect_domain_factor_range(const isl::basic_map &factor) const +{ + return this->intersect_domain_factor_range(isl::map(factor)); +} + isl::map map::intersect_params(isl::set params) const { auto res = isl_map_intersect_params(copy(), 
params.release()); @@ -8692,18 +11207,58 @@ isl::map map::intersect_range(isl::set set) const return manage(res); } +isl::union_map map::intersect_range(const isl::space &space) const +{ + return isl::union_map(*this).intersect_range(space); +} + +isl::union_map map::intersect_range(const isl::union_set &uset) const +{ + return isl::union_map(*this).intersect_range(uset); +} + +isl::map map::intersect_range(const isl::basic_set &set) const +{ + return this->intersect_range(isl::set(set)); +} + +isl::map map::intersect_range(const isl::point &set) const +{ + return this->intersect_range(isl::set(set)); +} + isl::map map::intersect_range_factor_domain(isl::map factor) const { auto res = isl_map_intersect_range_factor_domain(copy(), factor.release()); return manage(res); } +isl::union_map map::intersect_range_factor_domain(const isl::union_map &factor) const +{ + return isl::union_map(*this).intersect_range_factor_domain(factor); +} + +isl::map map::intersect_range_factor_domain(const isl::basic_map &factor) const +{ + return this->intersect_range_factor_domain(isl::map(factor)); +} + isl::map map::intersect_range_factor_range(isl::map factor) const { auto res = isl_map_intersect_range_factor_range(copy(), factor.release()); return manage(res); } +isl::union_map map::intersect_range_factor_range(const isl::union_map &factor) const +{ + return isl::union_map(*this).intersect_range_factor_range(factor); +} + +isl::map map::intersect_range_factor_range(const isl::basic_map &factor) const +{ + return this->intersect_range_factor_range(isl::map(factor)); +} + boolean map::involves_dims(isl::dim type, unsigned int first, unsigned int n) const { auto res = isl_map_involves_dims(get(), static_cast(type), first, n); @@ -8722,6 +11277,16 @@ boolean map::is_disjoint(const isl::map &map2) const return manage(res); } +boolean map::is_disjoint(const isl::union_map &umap2) const +{ + return isl::union_map(*this).is_disjoint(umap2); +} + +boolean map::is_disjoint(const isl::basic_map 
&map2) const +{ + return this->is_disjoint(isl::map(map2)); +} + boolean map::is_empty() const { auto res = isl_map_is_empty(get()); @@ -8734,21 +11299,19 @@ boolean map::is_equal(const isl::map &map2) const return manage(res); } -boolean map::is_identity() const +boolean map::is_equal(const isl::union_map &umap2) const { - auto res = isl_map_is_identity(get()); - return manage(res); + return isl::union_map(*this).is_equal(umap2); } -boolean map::is_injective() const +boolean map::is_equal(const isl::basic_map &map2) const { - auto res = isl_map_is_injective(get()); - return manage(res); + return this->is_equal(isl::map(map2)); } -boolean map::is_product() const +boolean map::is_injective() const { - auto res = isl_map_is_product(get()); + auto res = isl_map_is_injective(get()); return manage(res); } @@ -8764,16 +11327,35 @@ boolean map::is_strict_subset(const isl::map &map2) const return manage(res); } +boolean map::is_strict_subset(const isl::union_map &umap2) const +{ + return isl::union_map(*this).is_strict_subset(umap2); +} + +boolean map::is_strict_subset(const isl::basic_map &map2) const +{ + return this->is_strict_subset(isl::map(map2)); +} + boolean map::is_subset(const isl::map &map2) const { auto res = isl_map_is_subset(get(), map2.get()); return manage(res); } -int map::is_translation() const +boolean map::is_subset(const isl::union_map &umap2) const +{ + return isl::union_map(*this).is_subset(umap2); +} + +boolean map::is_subset(const isl::basic_map &map2) const +{ + return this->is_subset(isl::map(map2)); +} + +boolean map::isa_map() const { - auto res = isl_map_is_translation(get()); - return res; + return isl::union_map(*this).isa_map(); } isl::map map::lex_ge(isl::space set_space) @@ -8788,18 +11370,6 @@ isl::map map::lex_ge_at(isl::multi_pw_aff mpa) const return manage(res); } -isl::map map::lex_ge_first(isl::space space, unsigned int n) -{ - auto res = isl_map_lex_ge_first(space.release(), n); - return manage(res); -} - -isl::map 
map::lex_ge_map(isl::map map2) const -{ - auto res = isl_map_lex_ge_map(copy(), map2.release()); - return manage(res); -} - isl::map map::lex_gt(isl::space set_space) { auto res = isl_map_lex_gt(set_space.release()); @@ -8812,18 +11382,6 @@ isl::map map::lex_gt_at(isl::multi_pw_aff mpa) const return manage(res); } -isl::map map::lex_gt_first(isl::space space, unsigned int n) -{ - auto res = isl_map_lex_gt_first(space.release(), n); - return manage(res); -} - -isl::map map::lex_gt_map(isl::map map2) const -{ - auto res = isl_map_lex_gt_map(copy(), map2.release()); - return manage(res); -} - isl::map map::lex_le(isl::space set_space) { auto res = isl_map_lex_le(set_space.release()); @@ -8836,18 +11394,6 @@ isl::map map::lex_le_at(isl::multi_pw_aff mpa) const return manage(res); } -isl::map map::lex_le_first(isl::space space, unsigned int n) -{ - auto res = isl_map_lex_le_first(space.release(), n); - return manage(res); -} - -isl::map map::lex_le_map(isl::map map2) const -{ - auto res = isl_map_lex_le_map(copy(), map2.release()); - return manage(res); -} - isl::map map::lex_lt(isl::space set_space) { auto res = isl_map_lex_lt(set_space.release()); @@ -8860,18 +11406,6 @@ isl::map map::lex_lt_at(isl::multi_pw_aff mpa) const return manage(res); } -isl::map map::lex_lt_first(isl::space space, unsigned int n) -{ - auto res = isl_map_lex_lt_first(space.release(), n); - return manage(res); -} - -isl::map map::lex_lt_map(isl::map map2) const -{ - auto res = isl_map_lex_lt_map(copy(), map2.release()); - return manage(res); -} - isl::map map::lexmax() const { auto res = isl_map_lexmax(copy()); @@ -8908,10 +11442,9 @@ isl::map map::lower_bound_si(isl::dim type, unsigned int pos, int value) const return manage(res); } -isl::map map::lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const +isl::map_list map::map_list() const { - auto res = isl_map_lower_bound_val(copy(), static_cast(type), pos, value.release()); - return manage(res); + return 
isl::union_map(*this).map_list(); } isl::multi_pw_aff map::max_multi_pw_aff() const @@ -8932,310 +11465,291 @@ isl::map map::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_ty return manage(res); } -isl_size map::n_basic_map() const +class size map::n_basic_map() const { auto res = isl_map_n_basic_map(get()); - return res; + return manage(res); } -isl::map map::nat_universe(isl::space space) +isl::map map::order_lt(isl::dim type1, int pos1, isl::dim type2, int pos2) const { - auto res = isl_map_nat_universe(space.release()); + auto res = isl_map_order_lt(copy(), static_cast(type1), pos1, static_cast(type2), pos2); return manage(res); } -isl::map map::neg() const +isl::set map::params() const { - auto res = isl_map_neg(copy()); - return manage(res); + return isl::union_map(*this).params(); } -isl::map map::oppose(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::basic_map map::polyhedral_hull() const { - auto res = isl_map_oppose(copy(), static_cast(type1), pos1, static_cast(type2), pos2); + auto res = isl_map_polyhedral_hull(copy()); return manage(res); } -isl::map map::order_ge(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::map map::preimage_domain(isl::multi_aff ma) const { - auto res = isl_map_order_ge(copy(), static_cast(type1), pos1, static_cast(type2), pos2); + auto res = isl_map_preimage_domain_multi_aff(copy(), ma.release()); return manage(res); } -isl::map map::order_gt(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::map map::preimage_domain(isl::multi_pw_aff mpa) const { - auto res = isl_map_order_gt(copy(), static_cast(type1), pos1, static_cast(type2), pos2); + auto res = isl_map_preimage_domain_multi_pw_aff(copy(), mpa.release()); return manage(res); } -isl::map map::order_le(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::map map::preimage_domain(isl::pw_multi_aff pma) const { - auto res = isl_map_order_le(copy(), static_cast(type1), pos1, static_cast(type2), pos2); + auto 
res = isl_map_preimage_domain_pw_multi_aff(copy(), pma.release()); return manage(res); } -isl::map map::order_lt(isl::dim type1, int pos1, isl::dim type2, int pos2) const +isl::union_map map::preimage_domain(const isl::union_pw_multi_aff &upma) const { - auto res = isl_map_order_lt(copy(), static_cast(type1), pos1, static_cast(type2), pos2); - return manage(res); + return isl::union_map(*this).preimage_domain(upma); } -isl::set map::params() const +isl::map map::preimage_range(isl::multi_aff ma) const { - auto res = isl_map_params(copy()); + auto res = isl_map_preimage_range_multi_aff(copy(), ma.release()); return manage(res); } -isl::val map::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const +isl::map map::preimage_range(isl::pw_multi_aff pma) const { - auto res = isl_map_plain_get_val_if_fixed(get(), static_cast(type), pos); + auto res = isl_map_preimage_range_pw_multi_aff(copy(), pma.release()); return manage(res); } -boolean map::plain_is_empty() const +isl::union_map map::preimage_range(const isl::union_pw_multi_aff &upma) const { - auto res = isl_map_plain_is_empty(get()); - return manage(res); + return isl::union_map(*this).preimage_range(upma); } -boolean map::plain_is_equal(const isl::map &map2) const +isl::map map::product(isl::map map2) const { - auto res = isl_map_plain_is_equal(get(), map2.get()); + auto res = isl_map_product(copy(), map2.release()); return manage(res); } -boolean map::plain_is_injective() const +isl::union_map map::product(const isl::union_map &umap2) const { - auto res = isl_map_plain_is_injective(get()); - return manage(res); + return isl::union_map(*this).product(umap2); } -boolean map::plain_is_single_valued() const +isl::map map::product(const isl::basic_map &map2) const { - auto res = isl_map_plain_is_single_valued(get()); - return manage(res); + return this->product(isl::map(map2)); } -boolean map::plain_is_universe() const +isl::map map::project_out(isl::dim type, unsigned int first, unsigned int n) const { - auto 
res = isl_map_plain_is_universe(get()); + auto res = isl_map_project_out(copy(), static_cast(type), first, n); return manage(res); } -isl::basic_map map::plain_unshifted_simple_hull() const +isl::map map::project_out_all_params() const { - auto res = isl_map_plain_unshifted_simple_hull(copy()); + auto res = isl_map_project_out_all_params(copy()); return manage(res); } -isl::basic_map map::polyhedral_hull() const +isl::set map::range() const { - auto res = isl_map_polyhedral_hull(copy()); + auto res = isl_map_range(copy()); return manage(res); } -isl::map map::preimage_domain(isl::multi_aff ma) const +isl::map map::range_factor_domain() const { - auto res = isl_map_preimage_domain_multi_aff(copy(), ma.release()); + auto res = isl_map_range_factor_domain(copy()); return manage(res); } -isl::map map::preimage_domain(isl::multi_pw_aff mpa) const +isl::map map::range_factor_range() const { - auto res = isl_map_preimage_domain_multi_pw_aff(copy(), mpa.release()); + auto res = isl_map_range_factor_range(copy()); return manage(res); } -isl::map map::preimage_domain(isl::pw_multi_aff pma) const +isl::fixed_box map::range_lattice_tile() const { - auto res = isl_map_preimage_domain_pw_multi_aff(copy(), pma.release()); + auto res = isl_map_get_range_lattice_tile(get()); return manage(res); } -isl::map map::preimage_range(isl::multi_aff ma) const +isl::fixed_box map::get_range_lattice_tile() const { - auto res = isl_map_preimage_range_multi_aff(copy(), ma.release()); - return manage(res); + return range_lattice_tile(); } -isl::map map::preimage_range(isl::pw_multi_aff pma) const +isl::map map::range_map() const { - auto res = isl_map_preimage_range_pw_multi_aff(copy(), pma.release()); + auto res = isl_map_range_map(copy()); return manage(res); } -isl::map map::product(isl::map map2) const +isl::map map::range_product(isl::map map2) const { - auto res = isl_map_product(copy(), map2.release()); + auto res = isl_map_range_product(copy(), map2.release()); return manage(res); } 
-isl::map map::project_out(isl::dim type, unsigned int first, unsigned int n) const +isl::union_map map::range_product(const isl::union_map &umap2) const { - auto res = isl_map_project_out(copy(), static_cast(type), first, n); - return manage(res); + return isl::union_map(*this).range_product(umap2); } -isl::map map::project_out_all_params() const +isl::map map::range_product(const isl::basic_map &map2) const { - auto res = isl_map_project_out_all_params(copy()); - return manage(res); + return this->range_product(isl::map(map2)); } -isl::set map::range() const +isl::map map::range_reverse() const { - auto res = isl_map_range(copy()); + auto res = isl_map_range_reverse(copy()); return manage(res); } -isl::map map::range_curry() const +isl::fixed_box map::range_simple_fixed_box_hull() const { - auto res = isl_map_range_curry(copy()); + auto res = isl_map_get_range_simple_fixed_box_hull(get()); return manage(res); } -isl::map map::range_factor_domain() const +isl::fixed_box map::get_range_simple_fixed_box_hull() const { - auto res = isl_map_range_factor_domain(copy()); - return manage(res); + return range_simple_fixed_box_hull(); } -isl::map map::range_factor_range() const +class size map::range_tuple_dim() const { - auto res = isl_map_range_factor_range(copy()); + auto res = isl_map_range_tuple_dim(get()); return manage(res); } -boolean map::range_is_wrapping() const +isl::id map::range_tuple_id() const { - auto res = isl_map_range_is_wrapping(get()); + auto res = isl_map_get_range_tuple_id(get()); return manage(res); } -isl::map map::range_map() const +isl::id map::get_range_tuple_id() const { - auto res = isl_map_range_map(copy()); - return manage(res); + return range_tuple_id(); } -isl::map map::range_product(isl::map map2) const +isl::map map::reverse() const { - auto res = isl_map_range_product(copy(), map2.release()); + auto res = isl_map_reverse(copy()); return manage(res); } -isl::map map::range_reverse() const +isl::basic_map map::sample() const { - auto res 
= isl_map_range_reverse(copy()); + auto res = isl_map_sample(copy()); return manage(res); } -isl_size map::range_tuple_dim() const +isl::map map::set_domain_tuple(isl::id id) const { - auto res = isl_map_range_tuple_dim(get()); - return res; + auto res = isl_map_set_domain_tuple_id(copy(), id.release()); + return manage(res); } -isl::map map::remove_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::map map::set_domain_tuple(const std::string &id) const { - auto res = isl_map_remove_dims(copy(), static_cast(type), first, n); - return manage(res); + return this->set_domain_tuple(isl::id(ctx(), id)); } -isl::map map::remove_divs() const +isl::map map::set_range_tuple(isl::id id) const { - auto res = isl_map_remove_divs(copy()); + auto res = isl_map_set_range_tuple_id(copy(), id.release()); return manage(res); } -isl::map map::remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::map map::set_range_tuple(const std::string &id) const { - auto res = isl_map_remove_divs_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl::map map::remove_redundancies() const +isl::map map::set_tuple_id(isl::dim type, isl::id id) const { - auto res = isl_map_remove_redundancies(copy()); + auto res = isl_map_set_tuple_id(copy(), static_cast(type), id.release()); return manage(res); } -isl::map map::remove_unknown_divs() const +isl::map map::set_tuple_id(isl::dim type, const std::string &id) const { - auto res = isl_map_remove_unknown_divs(copy()); - return manage(res); + return this->set_tuple_id(type, isl::id(ctx(), id)); } -isl::map map::reset_tuple_id(isl::dim type) const +isl::space map::space() const { - auto res = isl_map_reset_tuple_id(copy(), static_cast(type)); + auto res = isl_map_get_space(get()); return manage(res); } -isl::map map::reset_user() const +isl::space map::get_space() const { - auto res = isl_map_reset_user(copy()); - return 
manage(res); + return space(); } -isl::map map::reverse() const +isl::map map::subtract(isl::map map2) const { - auto res = isl_map_reverse(copy()); + auto res = isl_map_subtract(copy(), map2.release()); return manage(res); } -isl::basic_map map::sample() const +isl::union_map map::subtract(const isl::union_map &umap2) const { - auto res = isl_map_sample(copy()); - return manage(res); + return isl::union_map(*this).subtract(umap2); } -isl::map map::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::map map::subtract(const isl::basic_map &map2) const { - auto res = isl_map_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return this->subtract(isl::map(map2)); } -isl::map map::set_tuple_id(isl::dim type, isl::id id) const +isl::union_map map::subtract_domain(const isl::union_set &dom) const { - auto res = isl_map_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); + return isl::union_map(*this).subtract_domain(dom); } -isl::map map::set_tuple_name(isl::dim type, const std::string &s) const +isl::union_map map::subtract_range(const isl::union_set &dom) const { - auto res = isl_map_set_tuple_name(copy(), static_cast(type), s.c_str()); - return manage(res); + return isl::union_map(*this).subtract_range(dom); } -isl::basic_map map::simple_hull() const +isl::map map::sum(isl::map map2) const { - auto res = isl_map_simple_hull(copy()); + auto res = isl_map_sum(copy(), map2.release()); return manage(res); } -isl::map map::subtract(isl::map map2) const +isl::map_list map::to_list() const { - auto res = isl_map_subtract(copy(), map2.release()); + auto res = isl_map_to_list(copy()); return manage(res); } -isl::map map::subtract_domain(isl::set dom) const +isl::union_map map::to_union_map() const { - auto res = isl_map_subtract_domain(copy(), dom.release()); + auto res = isl_map_to_union_map(copy()); return manage(res); } -isl::map map::subtract_range(isl::set dom) const +isl::id map::tuple_id(isl::dim 
type) const { - auto res = isl_map_subtract_range(copy(), dom.release()); + auto res = isl_map_get_tuple_id(get(), static_cast(type)); return manage(res); } -isl::map map::sum(isl::map map2) const +isl::id map::get_tuple_id(isl::dim type) const { - auto res = isl_map_sum(copy(), map2.release()); - return manage(res); + return tuple_id(type); } isl::map map::uncurry() const @@ -9250,6 +11764,16 @@ isl::map map::unite(isl::map map2) const return manage(res); } +isl::union_map map::unite(const isl::union_map &umap2) const +{ + return isl::union_map(*this).unite(umap2); +} + +isl::map map::unite(const isl::basic_map &map2) const +{ + return this->unite(isl::map(map2)); +} + isl::map map::universe(isl::space space) { auto res = isl_map_universe(space.release()); @@ -9262,12 +11786,6 @@ isl::basic_map map::unshifted_simple_hull() const return manage(res); } -isl::basic_map map::unshifted_simple_hull_from_map_list(isl::map_list list) const -{ - auto res = isl_map_unshifted_simple_hull_from_map_list(copy(), list.release()); - return manage(res); -} - isl::map map::upper_bound(isl::multi_pw_aff upper) const { auto res = isl_map_upper_bound_multi_pw_aff(copy(), upper.release()); @@ -9280,12 +11798,6 @@ isl::map map::upper_bound_si(isl::dim type, unsigned int pos, int value) const return manage(res); } -isl::map map::upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const -{ - auto res = isl_map_upper_bound_val(copy(), static_cast(type), pos, value.release()); - return manage(res); -} - isl::set map::wrap() const { auto res = isl_map_wrap(copy()); @@ -9298,6 +11810,18 @@ isl::map map::zip() const return manage(res); } +inline std::ostream &operator<<(std::ostream &os, const map &obj) +{ + char *str = isl_map_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // implementations for isl::map_list map_list manage(__isl_take isl_map_list *ptr) { return map_list(ptr); @@ -9316,10 
+11840,26 @@ map_list::map_list(const map_list &obj) ptr = obj.copy(); } - map_list::map_list(__isl_take isl_map_list *ptr) : ptr(ptr) {} +map_list::map_list(isl::ctx ctx, int n) +{ + auto res = isl_map_list_alloc(ctx.release(), n); + ptr = res; +} + +map_list::map_list(isl::map el) +{ + auto res = isl_map_list_from_map(el.release()); + ptr = res; +} + +map_list::map_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_map_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} map_list &map_list::operator=(map_list obj) { std::swap(this->ptr, obj.ptr); @@ -9349,28 +11889,27 @@ bool map_list::is_null() const { return ptr == nullptr; } - isl::ctx map_list::ctx() const { return isl::ctx(isl_map_list_get_ctx(ptr)); } -void map_list::dump() const { - isl_map_list_dump(get()); -} - - isl::map_list map_list::add(isl::map el) const { auto res = isl_map_list_add(copy(), el.release()); return manage(res); } -isl::map_list map_list::alloc(isl::ctx ctx, int n) +isl::map map_list::at(int index) const { - auto res = isl_map_list_alloc(ctx.release(), n); + auto res = isl_map_list_get_at(get(), index); return manage(res); } +isl::map map_list::get_at(int index) const +{ + return at(index); +} + isl::map_list map_list::clear() const { auto res = isl_map_list_clear(copy()); @@ -9389,894 +11928,845 @@ isl::map_list map_list::drop(unsigned int first, unsigned int n) const return manage(res); } -stat map_list::foreach(const std::function &fn) const +stat map_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_map *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_map_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::map_list map_list::from_map(isl::map el) +isl::map_list 
map_list::insert(unsigned int pos, isl::map el) const { - auto res = isl_map_list_from_map(el.release()); + auto res = isl_map_list_insert(copy(), pos, el.release()); return manage(res); } -isl::map map_list::get_at(int index) const +class size map_list::size() const { - auto res = isl_map_list_get_at(get(), index); + auto res = isl_map_list_size(get()); return manage(res); } -isl::map map_list::get_map(int index) const +inline std::ostream &operator<<(std::ostream &os, const map_list &obj) { - auto res = isl_map_list_get_map(get(), index); - return manage(res); + char *str = isl_map_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::map_list map_list::insert(unsigned int pos, isl::map el) const -{ - auto res = isl_map_list_insert(copy(), pos, el.release()); - return manage(res); +// implementations for isl::multi_aff +multi_aff manage(__isl_take isl_multi_aff *ptr) { + return multi_aff(ptr); } - -isl_size map_list::n_map() const -{ - auto res = isl_map_list_n_map(get()); - return res; +multi_aff manage_copy(__isl_keep isl_multi_aff *ptr) { + ptr = isl_multi_aff_copy(ptr); + return multi_aff(ptr); } -isl::map_list map_list::reverse() const -{ - auto res = isl_map_list_reverse(copy()); - return manage(res); -} +multi_aff::multi_aff() + : ptr(nullptr) {} -isl::map_list map_list::set_map(int index, isl::map el) const +multi_aff::multi_aff(const multi_aff &obj) + : ptr(nullptr) { - auto res = isl_map_list_set_map(copy(), index, el.release()); - return manage(res); + ptr = obj.copy(); } -isl_size map_list::size() const -{ - auto res = isl_map_list_size(get()); - return res; -} +multi_aff::multi_aff(__isl_take isl_multi_aff *ptr) + : ptr(ptr) {} -isl::map_list map_list::swap(unsigned int pos1, unsigned int pos2) const +multi_aff::multi_aff(isl::aff aff) { - auto res = isl_map_list_swap(copy(), pos1, pos2); - return manage(res); + auto res = isl_multi_aff_from_aff(aff.release()); + 
ptr = res; } -// implementations for isl::mat -mat manage(__isl_take isl_mat *ptr) { - return mat(ptr); -} -mat manage_copy(__isl_keep isl_mat *ptr) { - ptr = isl_mat_copy(ptr); - return mat(ptr); +multi_aff::multi_aff(isl::space space, isl::aff_list list) +{ + auto res = isl_multi_aff_from_aff_list(space.release(), list.release()); + ptr = res; } -mat::mat() - : ptr(nullptr) {} - -mat::mat(const mat &obj) - : ptr(nullptr) +multi_aff::multi_aff(isl::ctx ctx, const std::string &str) { - ptr = obj.copy(); + auto res = isl_multi_aff_read_from_str(ctx.release(), str.c_str()); + ptr = res; } - -mat::mat(__isl_take isl_mat *ptr) - : ptr(ptr) {} - - -mat &mat::operator=(mat obj) { +multi_aff &multi_aff::operator=(multi_aff obj) { std::swap(this->ptr, obj.ptr); return *this; } -mat::~mat() { +multi_aff::~multi_aff() { if (ptr) - isl_mat_free(ptr); + isl_multi_aff_free(ptr); } -__isl_give isl_mat *mat::copy() const & { - return isl_mat_copy(ptr); +__isl_give isl_multi_aff *multi_aff::copy() const & { + return isl_multi_aff_copy(ptr); } -__isl_keep isl_mat *mat::get() const { +__isl_keep isl_multi_aff *multi_aff::get() const { return ptr; } -__isl_give isl_mat *mat::release() { - isl_mat *tmp = ptr; +__isl_give isl_multi_aff *multi_aff::release() { + isl_multi_aff *tmp = ptr; ptr = nullptr; return tmp; } -bool mat::is_null() const { +bool multi_aff::is_null() const { return ptr == nullptr; } +isl::ctx multi_aff::ctx() const { + return isl::ctx(isl_multi_aff_get_ctx(ptr)); +} -isl::ctx mat::ctx() const { - return isl::ctx(isl_mat_get_ctx(ptr)); +isl::multi_aff multi_aff::add(isl::multi_aff multi2) const +{ + auto res = isl_multi_aff_add(copy(), multi2.release()); + return manage(res); } -void mat::dump() const { - isl_mat_dump(get()); +isl::multi_pw_aff multi_aff::add(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_multi_aff(*this).add(multi2); } +isl::multi_union_pw_aff multi_aff::add(const isl::multi_union_pw_aff &multi2) const +{ + return 
isl::pw_multi_aff(*this).add(multi2); +} -isl::mat mat::add_rows(unsigned int n) const +isl::pw_multi_aff multi_aff::add(const isl::pw_multi_aff &pma2) const { - auto res = isl_mat_add_rows(copy(), n); - return manage(res); + return isl::pw_multi_aff(*this).add(pma2); } -isl::mat mat::add_zero_cols(unsigned int n) const +isl::union_pw_multi_aff multi_aff::add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_mat_add_zero_cols(copy(), n); - return manage(res); + return isl::pw_multi_aff(*this).add(upma2); } -isl::mat mat::add_zero_rows(unsigned int n) const +isl::multi_aff multi_aff::add(const isl::aff &multi2) const { - auto res = isl_mat_add_zero_rows(copy(), n); - return manage(res); + return this->add(isl::multi_aff(multi2)); } -isl::mat mat::aff_direct_sum(isl::mat right) const +isl::multi_aff multi_aff::add_constant(isl::multi_val mv) const { - auto res = isl_mat_aff_direct_sum(copy(), right.release()); + auto res = isl_multi_aff_add_constant_multi_val(copy(), mv.release()); return manage(res); } -isl::mat mat::alloc(isl::ctx ctx, unsigned int n_row, unsigned int n_col) +isl::multi_aff multi_aff::add_constant(isl::val v) const { - auto res = isl_mat_alloc(ctx.release(), n_row, n_col); + auto res = isl_multi_aff_add_constant_val(copy(), v.release()); return manage(res); } -isl_size mat::cols() const +isl::multi_aff multi_aff::add_constant(long v) const { - auto res = isl_mat_cols(get()); - return res; + return this->add_constant(isl::val(ctx(), v)); } -isl::mat mat::concat(isl::mat bot) const +isl::union_pw_multi_aff multi_aff::add_pw_multi_aff(const isl::pw_multi_aff &pma) const { - auto res = isl_mat_concat(copy(), bot.release()); - return manage(res); + return isl::pw_multi_aff(*this).add_pw_multi_aff(pma); } -isl::mat mat::diagonal(isl::mat mat2) const +isl::union_pw_multi_aff multi_aff::apply(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_mat_diagonal(copy(), mat2.release()); - return manage(res); + return 
isl::pw_multi_aff(*this).apply(upma2); } -isl::mat mat::drop_cols(unsigned int col, unsigned int n) const +isl::map multi_aff::as_map() const { - auto res = isl_mat_drop_cols(copy(), col, n); + auto res = isl_multi_aff_as_map(copy()); return manage(res); } -isl::mat mat::drop_rows(unsigned int row, unsigned int n) const +isl::multi_aff multi_aff::as_multi_aff() const { - auto res = isl_mat_drop_rows(copy(), row, n); - return manage(res); + return isl::pw_multi_aff(*this).as_multi_aff(); } -isl::mat mat::from_row_vec(isl::vec vec) +isl::multi_union_pw_aff multi_aff::as_multi_union_pw_aff() const { - auto res = isl_mat_from_row_vec(vec.release()); - return manage(res); + return isl::pw_multi_aff(*this).as_multi_union_pw_aff(); } -isl::val mat::get_element_val(int row, int col) const +isl::pw_multi_aff multi_aff::as_pw_multi_aff() const { - auto res = isl_mat_get_element_val(get(), row, col); - return manage(res); + return isl::pw_multi_aff(*this).as_pw_multi_aff(); } -boolean mat::has_linearly_independent_rows(const isl::mat &mat2) const +isl::set multi_aff::as_set() const { - auto res = isl_mat_has_linearly_independent_rows(get(), mat2.get()); + auto res = isl_multi_aff_as_set(copy()); return manage(res); } -int mat::initial_non_zero_cols() const +isl::union_map multi_aff::as_union_map() const { - auto res = isl_mat_initial_non_zero_cols(get()); - return res; + return isl::pw_multi_aff(*this).as_union_map(); } -isl::mat mat::insert_cols(unsigned int col, unsigned int n) const +isl::aff multi_aff::at(int pos) const { - auto res = isl_mat_insert_cols(copy(), col, n); + auto res = isl_multi_aff_get_at(get(), pos); return manage(res); } -isl::mat mat::insert_rows(unsigned int row, unsigned int n) const +isl::aff multi_aff::get_at(int pos) const { - auto res = isl_mat_insert_rows(copy(), row, n); - return manage(res); + return at(pos); } -isl::mat mat::insert_zero_cols(unsigned int first, unsigned int n) const +isl::basic_set multi_aff::bind(isl::multi_id tuple) const { 
- auto res = isl_mat_insert_zero_cols(copy(), first, n); + auto res = isl_multi_aff_bind(copy(), tuple.release()); return manage(res); } -isl::mat mat::insert_zero_rows(unsigned int row, unsigned int n) const +isl::multi_aff multi_aff::bind_domain(isl::multi_id tuple) const { - auto res = isl_mat_insert_zero_rows(copy(), row, n); + auto res = isl_multi_aff_bind_domain(copy(), tuple.release()); return manage(res); } -isl::mat mat::inverse_product(isl::mat right) const +isl::multi_aff multi_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const { - auto res = isl_mat_inverse_product(copy(), right.release()); + auto res = isl_multi_aff_bind_domain_wrapped_domain(copy(), tuple.release()); return manage(res); } -boolean mat::is_equal(const isl::mat &mat2) const +isl::pw_multi_aff multi_aff::coalesce() const { - auto res = isl_mat_is_equal(get(), mat2.get()); - return manage(res); + return isl::pw_multi_aff(*this).coalesce(); } -isl::mat mat::lin_to_aff() const +isl::multi_val multi_aff::constant_multi_val() const { - auto res = isl_mat_lin_to_aff(copy()); + auto res = isl_multi_aff_get_constant_multi_val(get()); return manage(res); } -isl::mat mat::move_cols(unsigned int dst_col, unsigned int src_col, unsigned int n) const +isl::multi_val multi_aff::get_constant_multi_val() const { - auto res = isl_mat_move_cols(copy(), dst_col, src_col, n); - return manage(res); + return constant_multi_val(); } -isl::mat mat::normalize() const +class size multi_aff::dim(isl::dim type) const { - auto res = isl_mat_normalize(copy()); + auto res = isl_multi_aff_dim(get(), static_cast(type)); return manage(res); } -isl::mat mat::normalize_row(int row) const +isl::set multi_aff::domain() const { - auto res = isl_mat_normalize_row(copy(), row); - return manage(res); + return isl::pw_multi_aff(*this).domain(); } -isl::mat mat::product(isl::mat right) const +isl::multi_aff multi_aff::domain_map(isl::space space) { - auto res = isl_mat_product(copy(), right.release()); + auto res = 
isl_multi_aff_domain_map(space.release()); return manage(res); } -isl_size mat::rank() const +isl::pw_multi_aff multi_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_mat_rank(get()); - return res; + return isl::pw_multi_aff(*this).drop_dims(type, first, n); } -isl::mat mat::right_inverse() const +isl::pw_multi_aff multi_aff::extract_pw_multi_aff(const isl::space &space) const { - auto res = isl_mat_right_inverse(copy()); - return manage(res); + return isl::pw_multi_aff(*this).extract_pw_multi_aff(space); } -isl::mat mat::right_kernel() const +isl::multi_aff multi_aff::flat_range_product(isl::multi_aff multi2) const { - auto res = isl_mat_right_kernel(copy()); + auto res = isl_multi_aff_flat_range_product(copy(), multi2.release()); return manage(res); } -isl::mat mat::row_basis() const +isl::multi_pw_aff multi_aff::flat_range_product(const isl::multi_pw_aff &multi2) const { - auto res = isl_mat_row_basis(copy()); - return manage(res); + return isl::pw_multi_aff(*this).flat_range_product(multi2); } -isl::mat mat::row_basis_extension(isl::mat mat2) const +isl::multi_union_pw_aff multi_aff::flat_range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_mat_row_basis_extension(copy(), mat2.release()); - return manage(res); + return isl::pw_multi_aff(*this).flat_range_product(multi2); } -isl_size mat::rows() const +isl::pw_multi_aff multi_aff::flat_range_product(const isl::pw_multi_aff &pma2) const { - auto res = isl_mat_rows(get()); - return res; + return isl::pw_multi_aff(*this).flat_range_product(pma2); } -isl::mat mat::set_element_si(int row, int col, int v) const +isl::union_pw_multi_aff multi_aff::flat_range_product(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_mat_set_element_si(copy(), row, col, v); - return manage(res); + return isl::pw_multi_aff(*this).flat_range_product(upma2); } -isl::mat mat::set_element_val(int row, int col, isl::val v) const +isl::multi_aff 
multi_aff::flat_range_product(const isl::aff &multi2) const { - auto res = isl_mat_set_element_val(copy(), row, col, v.release()); - return manage(res); + return this->flat_range_product(isl::multi_aff(multi2)); } -isl::mat mat::swap_cols(unsigned int i, unsigned int j) const +isl::multi_aff multi_aff::floor() const { - auto res = isl_mat_swap_cols(copy(), i, j); + auto res = isl_multi_aff_floor(copy()); return manage(res); } -isl::mat mat::swap_rows(unsigned int i, unsigned int j) const +stat multi_aff::foreach_piece(const std::function &fn) const { - auto res = isl_mat_swap_rows(copy(), i, j); - return manage(res); + return isl::pw_multi_aff(*this).foreach_piece(fn); } -isl::mat mat::transpose() const +isl::multi_aff multi_aff::gist(isl::set context) const { - auto res = isl_mat_transpose(copy()); + auto res = isl_multi_aff_gist(copy(), context.release()); return manage(res); } -isl::mat mat::unimodular_complete(int row) const +isl::union_pw_multi_aff multi_aff::gist(const isl::union_set &context) const { - auto res = isl_mat_unimodular_complete(copy(), row); - return manage(res); + return isl::pw_multi_aff(*this).gist(context); } -isl::mat mat::vec_concat(isl::vec bot) const +isl::multi_aff multi_aff::gist(const isl::basic_set &context) const { - auto res = isl_mat_vec_concat(copy(), bot.release()); - return manage(res); + return this->gist(isl::set(context)); } -isl::vec mat::vec_inverse_product(isl::vec vec) const +isl::multi_aff multi_aff::gist(const isl::point &context) const { - auto res = isl_mat_vec_inverse_product(copy(), vec.release()); + return this->gist(isl::set(context)); +} + +boolean multi_aff::has_range_tuple_id() const +{ + auto res = isl_multi_aff_has_range_tuple_id(get()); return manage(res); } -isl::vec mat::vec_product(isl::vec vec) const +isl::multi_aff multi_aff::identity(isl::space space) { - auto res = isl_mat_vec_product(copy(), vec.release()); + auto res = isl_multi_aff_identity(space.release()); return manage(res); } -// 
implementations for isl::multi_aff -multi_aff manage(__isl_take isl_multi_aff *ptr) { - return multi_aff(ptr); +isl::multi_aff multi_aff::identity() const +{ + auto res = isl_multi_aff_identity_multi_aff(copy()); + return manage(res); } -multi_aff manage_copy(__isl_keep isl_multi_aff *ptr) { - ptr = isl_multi_aff_copy(ptr); - return multi_aff(ptr); + +isl::multi_aff multi_aff::identity_on_domain(isl::space space) +{ + auto res = isl_multi_aff_identity_on_domain_space(space.release()); + return manage(res); } -multi_aff::multi_aff() - : ptr(nullptr) {} +isl::multi_aff multi_aff::insert_domain(isl::space domain) const +{ + auto res = isl_multi_aff_insert_domain(copy(), domain.release()); + return manage(res); +} -multi_aff::multi_aff(const multi_aff &obj) - : ptr(nullptr) +isl::pw_multi_aff multi_aff::intersect_domain(const isl::set &set) const { - ptr = obj.copy(); + return isl::pw_multi_aff(*this).intersect_domain(set); } +isl::union_pw_multi_aff multi_aff::intersect_domain(const isl::space &space) const +{ + return isl::pw_multi_aff(*this).intersect_domain(space); +} -multi_aff::multi_aff(__isl_take isl_multi_aff *ptr) - : ptr(ptr) {} +isl::union_pw_multi_aff multi_aff::intersect_domain(const isl::union_set &uset) const +{ + return isl::pw_multi_aff(*this).intersect_domain(uset); +} -multi_aff::multi_aff(isl::aff aff) +isl::union_pw_multi_aff multi_aff::intersect_domain_wrapped_domain(const isl::union_set &uset) const { - auto res = isl_multi_aff_from_aff(aff.release()); - ptr = res; + return isl::pw_multi_aff(*this).intersect_domain_wrapped_domain(uset); } -multi_aff::multi_aff(isl::space space, isl::aff_list list) + +isl::union_pw_multi_aff multi_aff::intersect_domain_wrapped_range(const isl::union_set &uset) const { - auto res = isl_multi_aff_from_aff_list(space.release(), list.release()); - ptr = res; + return isl::pw_multi_aff(*this).intersect_domain_wrapped_range(uset); } -multi_aff::multi_aff(isl::ctx ctx, const std::string &str) + +isl::pw_multi_aff 
multi_aff::intersect_params(const isl::set &set) const { - auto res = isl_multi_aff_read_from_str(ctx.release(), str.c_str()); - ptr = res; + return isl::pw_multi_aff(*this).intersect_params(set); } -multi_aff &multi_aff::operator=(multi_aff obj) { - std::swap(this->ptr, obj.ptr); - return *this; +boolean multi_aff::involves_locals() const +{ + auto res = isl_multi_aff_involves_locals(get()); + return manage(res); } -multi_aff::~multi_aff() { - if (ptr) - isl_multi_aff_free(ptr); +boolean multi_aff::involves_nan() const +{ + auto res = isl_multi_aff_involves_nan(get()); + return manage(res); } -__isl_give isl_multi_aff *multi_aff::copy() const & { - return isl_multi_aff_copy(ptr); +boolean multi_aff::involves_param(const isl::id &id) const +{ + return isl::pw_multi_aff(*this).involves_param(id); } -__isl_keep isl_multi_aff *multi_aff::get() const { - return ptr; +boolean multi_aff::involves_param(const std::string &id) const +{ + return this->involves_param(isl::id(ctx(), id)); } -__isl_give isl_multi_aff *multi_aff::release() { - isl_multi_aff *tmp = ptr; - ptr = nullptr; - return tmp; +boolean multi_aff::involves_param(const isl::id_list &list) const +{ + return isl::pw_multi_aff(*this).involves_param(list); } -bool multi_aff::is_null() const { - return ptr == nullptr; +boolean multi_aff::isa_multi_aff() const +{ + return isl::pw_multi_aff(*this).isa_multi_aff(); } +boolean multi_aff::isa_pw_multi_aff() const +{ + return isl::pw_multi_aff(*this).isa_pw_multi_aff(); +} -isl::ctx multi_aff::ctx() const { - return isl::ctx(isl_multi_aff_get_ctx(ptr)); +isl::aff_list multi_aff::list() const +{ + auto res = isl_multi_aff_get_list(get()); + return manage(res); } -void multi_aff::dump() const { - isl_multi_aff_dump(get()); +isl::aff_list multi_aff::get_list() const +{ + return list(); } +isl::multi_pw_aff multi_aff::max(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_multi_aff(*this).max(multi2); +} -isl::multi_aff multi_aff::add(isl::multi_aff multi2) const 
+isl::multi_val multi_aff::max_multi_val() const { - auto res = isl_multi_aff_add(copy(), multi2.release()); - return manage(res); + return isl::pw_multi_aff(*this).max_multi_val(); } -isl::multi_aff multi_aff::add_constant(isl::multi_val mv) const +isl::multi_pw_aff multi_aff::min(const isl::multi_pw_aff &multi2) const { - auto res = isl_multi_aff_add_constant_multi_val(copy(), mv.release()); - return manage(res); + return isl::pw_multi_aff(*this).min(multi2); } -isl::multi_aff multi_aff::add_constant(isl::val v) const +isl::multi_val multi_aff::min_multi_val() const { - auto res = isl_multi_aff_add_constant_val(copy(), v.release()); - return manage(res); + return isl::pw_multi_aff(*this).min_multi_val(); } -isl::multi_aff multi_aff::add_dims(isl::dim type, unsigned int n) const +isl::multi_aff multi_aff::multi_val_on_domain(isl::space space, isl::multi_val mv) { - auto res = isl_multi_aff_add_dims(copy(), static_cast(type), n); + auto res = isl_multi_aff_multi_val_on_domain_space(space.release(), mv.release()); return manage(res); } -isl::multi_aff multi_aff::align_params(isl::space model) const +class size multi_aff::n_piece() const { - auto res = isl_multi_aff_align_params(copy(), model.release()); - return manage(res); + return isl::pw_multi_aff(*this).n_piece(); } -isl::basic_set multi_aff::bind(isl::multi_id tuple) const +isl::multi_aff multi_aff::neg() const { - auto res = isl_multi_aff_bind(copy(), tuple.release()); + auto res = isl_multi_aff_neg(copy()); return manage(res); } -isl::multi_aff multi_aff::bind_domain(isl::multi_id tuple) const +boolean multi_aff::plain_is_empty() const { - auto res = isl_multi_aff_bind_domain(copy(), tuple.release()); - return manage(res); + return isl::pw_multi_aff(*this).plain_is_empty(); } -isl::multi_aff multi_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const +boolean multi_aff::plain_is_equal(const isl::multi_aff &multi2) const { - auto res = isl_multi_aff_bind_domain_wrapped_domain(copy(), tuple.release()); + 
auto res = isl_multi_aff_plain_is_equal(get(), multi2.get()); return manage(res); } -isl_size multi_aff::dim(isl::dim type) const +boolean multi_aff::plain_is_equal(const isl::multi_pw_aff &multi2) const { - auto res = isl_multi_aff_dim(get(), static_cast(type)); - return res; + return isl::pw_multi_aff(*this).plain_is_equal(multi2); } -isl::multi_aff multi_aff::domain_map(isl::space space) +boolean multi_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_aff_domain_map(space.release()); - return manage(res); + return isl::pw_multi_aff(*this).plain_is_equal(multi2); } -isl::multi_aff multi_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +boolean multi_aff::plain_is_equal(const isl::aff &multi2) const { - auto res = isl_multi_aff_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return this->plain_is_equal(isl::multi_aff(multi2)); } -isl::multi_aff multi_aff::factor_range() const +isl::pw_multi_aff multi_aff::preimage_domain_wrapped_domain(const isl::pw_multi_aff &pma2) const { - auto res = isl_multi_aff_factor_range(copy()); - return manage(res); + return isl::pw_multi_aff(*this).preimage_domain_wrapped_domain(pma2); } -int multi_aff::find_dim_by_id(isl::dim type, const isl::id &id) const +isl::union_pw_multi_aff multi_aff::preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_multi_aff_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + return isl::pw_multi_aff(*this).preimage_domain_wrapped_domain(upma2); } -int multi_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::multi_aff multi_aff::product(isl::multi_aff multi2) const { - auto res = isl_multi_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + auto res = isl_multi_aff_product(copy(), multi2.release()); + return manage(res); } -isl::multi_aff multi_aff::flat_range_product(isl::multi_aff multi2) const +isl::multi_pw_aff 
multi_aff::product(const isl::multi_pw_aff &multi2) const { - auto res = isl_multi_aff_flat_range_product(copy(), multi2.release()); - return manage(res); + return isl::pw_multi_aff(*this).product(multi2); } -isl::multi_aff multi_aff::flatten_domain() const +isl::pw_multi_aff multi_aff::product(const isl::pw_multi_aff &pma2) const { - auto res = isl_multi_aff_flatten_domain(copy()); - return manage(res); + return isl::pw_multi_aff(*this).product(pma2); } -isl::multi_aff multi_aff::flatten_range() const +isl::multi_aff multi_aff::product(const isl::aff &multi2) const { - auto res = isl_multi_aff_flatten_range(copy()); - return manage(res); + return this->product(isl::multi_aff(multi2)); } -isl::multi_aff multi_aff::floor() const +isl::multi_aff multi_aff::pullback(isl::multi_aff ma2) const { - auto res = isl_multi_aff_floor(copy()); + auto res = isl_multi_aff_pullback_multi_aff(copy(), ma2.release()); return manage(res); } -isl::multi_aff multi_aff::from_range() const +isl::multi_pw_aff multi_aff::pullback(const isl::multi_pw_aff &mpa2) const { - auto res = isl_multi_aff_from_range(copy()); - return manage(res); + return isl::pw_multi_aff(*this).pullback(mpa2); } -isl::aff multi_aff::get_aff(int pos) const +isl::pw_multi_aff multi_aff::pullback(const isl::pw_multi_aff &pma2) const { - auto res = isl_multi_aff_get_aff(get(), pos); - return manage(res); + return isl::pw_multi_aff(*this).pullback(pma2); } -isl::aff multi_aff::get_at(int pos) const +isl::union_pw_multi_aff multi_aff::pullback(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_multi_aff_get_at(get(), pos); - return manage(res); + return isl::pw_multi_aff(*this).pullback(upma2); } -isl::multi_val multi_aff::get_constant_multi_val() const +isl::multi_aff multi_aff::pullback(const isl::aff &ma2) const { - auto res = isl_multi_aff_get_constant_multi_val(get()); - return manage(res); + return this->pullback(isl::multi_aff(ma2)); } -isl::id multi_aff::get_dim_id(isl::dim type, unsigned int pos) 
const +isl::pw_multi_aff_list multi_aff::pw_multi_aff_list() const { - auto res = isl_multi_aff_get_dim_id(get(), static_cast(type), pos); - return manage(res); + return isl::pw_multi_aff(*this).pw_multi_aff_list(); } -isl::space multi_aff::get_domain_space() const +isl::pw_multi_aff multi_aff::range_factor_domain() const { - auto res = isl_multi_aff_get_domain_space(get()); - return manage(res); + return isl::pw_multi_aff(*this).range_factor_domain(); } -isl::aff_list multi_aff::get_list() const +isl::pw_multi_aff multi_aff::range_factor_range() const { - auto res = isl_multi_aff_get_list(get()); - return manage(res); + return isl::pw_multi_aff(*this).range_factor_range(); } -isl::space multi_aff::get_space() const +isl::multi_aff multi_aff::range_map(isl::space space) { - auto res = isl_multi_aff_get_space(get()); + auto res = isl_multi_aff_range_map(space.release()); return manage(res); } -isl::id multi_aff::get_tuple_id(isl::dim type) const +isl::multi_aff multi_aff::range_product(isl::multi_aff multi2) const { - auto res = isl_multi_aff_get_tuple_id(get(), static_cast(type)); + auto res = isl_multi_aff_range_product(copy(), multi2.release()); return manage(res); } -std::string multi_aff::get_tuple_name(isl::dim type) const +isl::multi_pw_aff multi_aff::range_product(const isl::multi_pw_aff &multi2) const { - auto res = isl_multi_aff_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + return isl::pw_multi_aff(*this).range_product(multi2); } -isl::multi_aff multi_aff::gist(isl::set context) const +isl::multi_union_pw_aff multi_aff::range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_aff_gist(copy(), context.release()); - return manage(res); + return isl::pw_multi_aff(*this).range_product(multi2); } -isl::multi_aff multi_aff::gist_params(isl::set context) const +isl::pw_multi_aff multi_aff::range_product(const isl::pw_multi_aff &pma2) const { - auto res = isl_multi_aff_gist_params(copy(), 
context.release()); - return manage(res); + return isl::pw_multi_aff(*this).range_product(pma2); } -boolean multi_aff::has_tuple_id(isl::dim type) const +isl::union_pw_multi_aff multi_aff::range_product(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_multi_aff_has_tuple_id(get(), static_cast(type)); - return manage(res); + return isl::pw_multi_aff(*this).range_product(upma2); } -isl::multi_aff multi_aff::identity(isl::space space) +isl::multi_aff multi_aff::range_product(const isl::aff &multi2) const { - auto res = isl_multi_aff_identity(space.release()); - return manage(res); + return this->range_product(isl::multi_aff(multi2)); } -isl::multi_aff multi_aff::identity() const +isl::id multi_aff::range_tuple_id() const { - auto res = isl_multi_aff_identity_multi_aff(copy()); + auto res = isl_multi_aff_get_range_tuple_id(get()); return manage(res); } -isl::multi_aff multi_aff::identity_on_domain(isl::space space) +isl::id multi_aff::get_range_tuple_id() const { - auto res = isl_multi_aff_identity_on_domain_space(space.release()); - return manage(res); + return range_tuple_id(); } -isl::multi_aff multi_aff::insert_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::multi_aff multi_aff::reset_range_tuple_id() const { - auto res = isl_multi_aff_insert_dims(copy(), static_cast(type), first, n); + auto res = isl_multi_aff_reset_range_tuple_id(copy()); return manage(res); } -isl::multi_aff multi_aff::insert_domain(isl::space domain) const +isl::multi_aff multi_aff::reset_tuple_id(isl::dim type) const { - auto res = isl_multi_aff_insert_domain(copy(), domain.release()); + auto res = isl_multi_aff_reset_tuple_id(copy(), static_cast(type)); return manage(res); } -boolean multi_aff::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::multi_aff multi_aff::scale(isl::multi_val mv) const { - auto res = isl_multi_aff_involves_dims(get(), static_cast(type), first, n); + auto res = isl_multi_aff_scale_multi_val(copy(), 
mv.release()); return manage(res); } -boolean multi_aff::involves_locals() const +isl::multi_aff multi_aff::scale(isl::val v) const { - auto res = isl_multi_aff_involves_locals(get()); + auto res = isl_multi_aff_scale_val(copy(), v.release()); return manage(res); } -boolean multi_aff::involves_nan() const +isl::multi_aff multi_aff::scale(long v) const { - auto res = isl_multi_aff_involves_nan(get()); - return manage(res); + return this->scale(isl::val(ctx(), v)); } -isl::set multi_aff::lex_ge_set(isl::multi_aff ma2) const +isl::multi_aff multi_aff::scale_down(isl::multi_val mv) const { - auto res = isl_multi_aff_lex_ge_set(copy(), ma2.release()); + auto res = isl_multi_aff_scale_down_multi_val(copy(), mv.release()); return manage(res); } -isl::set multi_aff::lex_gt_set(isl::multi_aff ma2) const +isl::multi_aff multi_aff::scale_down(isl::val v) const { - auto res = isl_multi_aff_lex_gt_set(copy(), ma2.release()); + auto res = isl_multi_aff_scale_down_val(copy(), v.release()); return manage(res); } -isl::set multi_aff::lex_le_set(isl::multi_aff ma2) const +isl::multi_aff multi_aff::scale_down(long v) const { - auto res = isl_multi_aff_lex_le_set(copy(), ma2.release()); - return manage(res); + return this->scale_down(isl::val(ctx(), v)); } -isl::set multi_aff::lex_lt_set(isl::multi_aff ma2) const +isl::multi_aff multi_aff::set_aff(int pos, isl::aff el) const { - auto res = isl_multi_aff_lex_lt_set(copy(), ma2.release()); + auto res = isl_multi_aff_set_aff(copy(), pos, el.release()); return manage(res); } -isl::multi_aff multi_aff::mod_multi_val(isl::multi_val mv) const +isl::multi_aff multi_aff::set_at(int pos, isl::aff el) const { - auto res = isl_multi_aff_mod_multi_val(copy(), mv.release()); + auto res = isl_multi_aff_set_at(copy(), pos, el.release()); return manage(res); } -isl::multi_aff multi_aff::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::multi_pw_aff multi_aff::set_at(int pos, const 
isl::pw_aff &el) const { - auto res = isl_multi_aff_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); + return isl::pw_multi_aff(*this).set_at(pos, el); } -isl::multi_aff multi_aff::multi_val_on_space(isl::space space, isl::multi_val mv) +isl::multi_union_pw_aff multi_aff::set_at(int pos, const isl::union_pw_aff &el) const { - auto res = isl_multi_aff_multi_val_on_space(space.release(), mv.release()); - return manage(res); + return isl::pw_multi_aff(*this).set_at(pos, el); } -isl::multi_aff multi_aff::neg() const +isl::multi_pw_aff multi_aff::set_pw_aff(int pos, const isl::pw_aff &el) const { - auto res = isl_multi_aff_neg(copy()); - return manage(res); + return isl::pw_multi_aff(*this).set_pw_aff(pos, el); } -int multi_aff::plain_cmp(const isl::multi_aff &multi2) const +isl::pw_multi_aff multi_aff::set_pw_aff(unsigned int pos, const isl::pw_aff &pa) const { - auto res = isl_multi_aff_plain_cmp(get(), multi2.get()); - return res; + return isl::pw_multi_aff(*this).set_pw_aff(pos, pa); } -boolean multi_aff::plain_is_equal(const isl::multi_aff &multi2) const +isl::multi_aff multi_aff::set_range_tuple(isl::id id) const { - auto res = isl_multi_aff_plain_is_equal(get(), multi2.get()); + auto res = isl_multi_aff_set_range_tuple_id(copy(), id.release()); return manage(res); } -isl::multi_aff multi_aff::product(isl::multi_aff multi2) const +isl::multi_aff multi_aff::set_range_tuple(const std::string &id) const { - auto res = isl_multi_aff_product(copy(), multi2.release()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl::multi_aff multi_aff::project_domain_on_params() const +isl::multi_union_pw_aff multi_aff::set_union_pw_aff(int pos, const isl::union_pw_aff &el) const { - auto res = isl_multi_aff_project_domain_on_params(copy()); - return manage(res); + return isl::pw_multi_aff(*this).set_union_pw_aff(pos, el); } -isl::multi_aff multi_aff::project_out_map(isl::space space, isl::dim 
type, unsigned int first, unsigned int n) +class size multi_aff::size() const { - auto res = isl_multi_aff_project_out_map(space.release(), static_cast(type), first, n); + auto res = isl_multi_aff_size(get()); return manage(res); } -isl::multi_aff multi_aff::pullback(isl::multi_aff ma2) const +isl::space multi_aff::space() const { - auto res = isl_multi_aff_pullback_multi_aff(copy(), ma2.release()); + auto res = isl_multi_aff_get_space(get()); return manage(res); } -isl::multi_aff multi_aff::range_factor_domain() const +isl::space multi_aff::get_space() const { - auto res = isl_multi_aff_range_factor_domain(copy()); - return manage(res); + return space(); } -isl::multi_aff multi_aff::range_factor_range() const +isl::multi_aff multi_aff::sub(isl::multi_aff multi2) const { - auto res = isl_multi_aff_range_factor_range(copy()); + auto res = isl_multi_aff_sub(copy(), multi2.release()); return manage(res); } -boolean multi_aff::range_is_wrapping() const +isl::multi_pw_aff multi_aff::sub(const isl::multi_pw_aff &multi2) const { - auto res = isl_multi_aff_range_is_wrapping(get()); - return manage(res); + return isl::pw_multi_aff(*this).sub(multi2); } -isl::multi_aff multi_aff::range_map(isl::space space) +isl::multi_union_pw_aff multi_aff::sub(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_aff_range_map(space.release()); - return manage(res); + return isl::pw_multi_aff(*this).sub(multi2); } -isl::multi_aff multi_aff::range_product(isl::multi_aff multi2) const +isl::pw_multi_aff multi_aff::sub(const isl::pw_multi_aff &pma2) const { - auto res = isl_multi_aff_range_product(copy(), multi2.release()); - return manage(res); + return isl::pw_multi_aff(*this).sub(pma2); } -isl::multi_aff multi_aff::range_splice(unsigned int pos, isl::multi_aff multi2) const +isl::union_pw_multi_aff multi_aff::sub(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_multi_aff_range_splice(copy(), pos, multi2.release()); - return manage(res); + return 
isl::pw_multi_aff(*this).sub(upma2); } -isl::multi_aff multi_aff::reset_tuple_id(isl::dim type) const +isl::multi_aff multi_aff::sub(const isl::aff &multi2) const { - auto res = isl_multi_aff_reset_tuple_id(copy(), static_cast(type)); - return manage(res); + return this->sub(isl::multi_aff(multi2)); } -isl::multi_aff multi_aff::reset_user() const +isl::pw_multi_aff multi_aff::subtract_domain(const isl::set &set) const { - auto res = isl_multi_aff_reset_user(copy()); - return manage(res); + return isl::pw_multi_aff(*this).subtract_domain(set); } -isl::multi_aff multi_aff::scale(isl::multi_val mv) const +isl::union_pw_multi_aff multi_aff::subtract_domain(const isl::space &space) const { - auto res = isl_multi_aff_scale_multi_val(copy(), mv.release()); - return manage(res); + return isl::pw_multi_aff(*this).subtract_domain(space); } -isl::multi_aff multi_aff::scale(isl::val v) const +isl::union_pw_multi_aff multi_aff::subtract_domain(const isl::union_set &uset) const { - auto res = isl_multi_aff_scale_val(copy(), v.release()); - return manage(res); + return isl::pw_multi_aff(*this).subtract_domain(uset); } -isl::multi_aff multi_aff::scale_down(isl::multi_val mv) const +isl::pw_multi_aff_list multi_aff::to_list() const { - auto res = isl_multi_aff_scale_down_multi_val(copy(), mv.release()); - return manage(res); + return isl::pw_multi_aff(*this).to_list(); } -isl::multi_aff multi_aff::scale_down(isl::val v) const +isl::multi_pw_aff multi_aff::to_multi_pw_aff() const { - auto res = isl_multi_aff_scale_down_val(copy(), v.release()); + auto res = isl_multi_aff_to_multi_pw_aff(copy()); return manage(res); } -isl::multi_aff multi_aff::set_aff(int pos, isl::aff el) const +isl::multi_union_pw_aff multi_aff::to_multi_union_pw_aff() const { - auto res = isl_multi_aff_set_aff(copy(), pos, el.release()); + auto res = isl_multi_aff_to_multi_union_pw_aff(copy()); return manage(res); } -isl::multi_aff multi_aff::set_at(int pos, isl::aff el) const +isl::pw_multi_aff 
multi_aff::to_pw_multi_aff() const { - auto res = isl_multi_aff_set_at(copy(), pos, el.release()); + auto res = isl_multi_aff_to_pw_multi_aff(copy()); return manage(res); } -isl::multi_aff multi_aff::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::union_pw_multi_aff multi_aff::to_union_pw_multi_aff() const { - auto res = isl_multi_aff_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return isl::pw_multi_aff(*this).to_union_pw_multi_aff(); } -isl::multi_aff multi_aff::set_tuple_id(isl::dim type, isl::id id) const +isl::id multi_aff::tuple_id(isl::dim type) const { - auto res = isl_multi_aff_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); + return isl::pw_multi_aff(*this).tuple_id(type); } -isl::multi_aff multi_aff::set_tuple_name(isl::dim type, const std::string &s) const +isl::multi_aff multi_aff::unbind_params_insert_domain(isl::multi_id domain) const { - auto res = isl_multi_aff_set_tuple_name(copy(), static_cast(type), s.c_str()); + auto res = isl_multi_aff_unbind_params_insert_domain(copy(), domain.release()); return manage(res); } -isl_size multi_aff::size() const +isl::multi_pw_aff multi_aff::union_add(const isl::multi_pw_aff &mpa2) const { - auto res = isl_multi_aff_size(get()); - return res; + return isl::pw_multi_aff(*this).union_add(mpa2); } -isl::multi_aff multi_aff::splice(unsigned int in_pos, unsigned int out_pos, isl::multi_aff multi2) const +isl::multi_union_pw_aff multi_aff::union_add(const isl::multi_union_pw_aff &mupa2) const { - auto res = isl_multi_aff_splice(copy(), in_pos, out_pos, multi2.release()); - return manage(res); + return isl::pw_multi_aff(*this).union_add(mupa2); } -isl::multi_aff multi_aff::sub(isl::multi_aff multi2) const +isl::pw_multi_aff multi_aff::union_add(const isl::pw_multi_aff &pma2) const { - auto res = isl_multi_aff_sub(copy(), multi2.release()); - return manage(res); + return isl::pw_multi_aff(*this).union_add(pma2); } -isl::multi_aff 
multi_aff::unbind_params_insert_domain(isl::multi_id domain) const +isl::union_pw_multi_aff multi_aff::union_add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_multi_aff_unbind_params_insert_domain(copy(), domain.release()); - return manage(res); + return isl::pw_multi_aff(*this).union_add(upma2); } isl::multi_aff multi_aff::zero(isl::space space) @@ -10285,6 +12775,18 @@ isl::multi_aff multi_aff::zero(isl::space space) return manage(res); } +inline std::ostream &operator<<(std::ostream &os, const multi_aff &obj) +{ + char *str = isl_multi_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // implementations for isl::multi_id multi_id manage(__isl_take isl_multi_id *ptr) { return multi_id(ptr); @@ -10303,7 +12805,6 @@ multi_id::multi_id(const multi_id &obj) ptr = obj.copy(); } - multi_id::multi_id(__isl_take isl_multi_id *ptr) : ptr(ptr) {} @@ -10312,6 +12813,7 @@ multi_id::multi_id(isl::space space, isl::id_list list) auto res = isl_multi_id_from_id_list(space.release(), list.release()); ptr = res; } + multi_id::multi_id(isl::ctx ctx, const std::string &str) { auto res = isl_multi_id_read_from_str(ctx.release(), str.c_str()); @@ -10346,26 +12848,19 @@ bool multi_id::is_null() const { return ptr == nullptr; } - isl::ctx multi_id::ctx() const { return isl::ctx(isl_multi_id_get_ctx(ptr)); } -void multi_id::dump() const { - isl_multi_id_dump(get()); -} - - -isl::multi_id multi_id::align_params(isl::space model) const +isl::id multi_id::at(int pos) const { - auto res = isl_multi_id_align_params(copy(), model.release()); + auto res = isl_multi_id_get_at(get(), pos); return manage(res); } -isl::multi_id multi_id::factor_range() const +isl::id multi_id::get_at(int pos) const { - auto res = isl_multi_id_factor_range(copy()); - return manage(res); + return at(pos); } isl::multi_id multi_id::flat_range_product(isl::multi_id multi2) const @@ -10374,46 +12869,15 @@ 
isl::multi_id multi_id::flat_range_product(isl::multi_id multi2) const return manage(res); } -isl::multi_id multi_id::flatten_range() const -{ - auto res = isl_multi_id_flatten_range(copy()); - return manage(res); -} - -isl::multi_id multi_id::from_range() const -{ - auto res = isl_multi_id_from_range(copy()); - return manage(res); -} - -isl::id multi_id::get_at(int pos) const -{ - auto res = isl_multi_id_get_at(get(), pos); - return manage(res); -} - -isl::space multi_id::get_domain_space() const -{ - auto res = isl_multi_id_get_domain_space(get()); - return manage(res); -} - -isl::id multi_id::get_id(int pos) const -{ - auto res = isl_multi_id_get_id(get(), pos); - return manage(res); -} - -isl::id_list multi_id::get_list() const +isl::id_list multi_id::list() const { auto res = isl_multi_id_get_list(get()); return manage(res); } -isl::space multi_id::get_space() const +isl::id_list multi_id::get_list() const { - auto res = isl_multi_id_get_space(get()); - return manage(res); + return list(); } boolean multi_id::plain_is_equal(const isl::multi_id &multi2) const @@ -10422,58 +12886,61 @@ boolean multi_id::plain_is_equal(const isl::multi_id &multi2) const return manage(res); } -isl::multi_id multi_id::range_factor_domain() const +isl::multi_id multi_id::range_product(isl::multi_id multi2) const { - auto res = isl_multi_id_range_factor_domain(copy()); + auto res = isl_multi_id_range_product(copy(), multi2.release()); return manage(res); } -isl::multi_id multi_id::range_factor_range() const +isl::multi_id multi_id::set_at(int pos, isl::id el) const { - auto res = isl_multi_id_range_factor_range(copy()); + auto res = isl_multi_id_set_at(copy(), pos, el.release()); return manage(res); } -boolean multi_id::range_is_wrapping() const +isl::multi_id multi_id::set_at(int pos, const std::string &el) const { - auto res = isl_multi_id_range_is_wrapping(get()); - return manage(res); + return this->set_at(pos, isl::id(ctx(), el)); } -isl::multi_id 
multi_id::range_product(isl::multi_id multi2) const +isl::multi_id multi_id::set_id(int pos, isl::id el) const { - auto res = isl_multi_id_range_product(copy(), multi2.release()); + auto res = isl_multi_id_set_id(copy(), pos, el.release()); return manage(res); } -isl::multi_id multi_id::range_splice(unsigned int pos, isl::multi_id multi2) const +isl::multi_id multi_id::set_id(int pos, const std::string &el) const { - auto res = isl_multi_id_range_splice(copy(), pos, multi2.release()); - return manage(res); + return this->set_id(pos, isl::id(ctx(), el)); } -isl::multi_id multi_id::reset_user() const +class size multi_id::size() const { - auto res = isl_multi_id_reset_user(copy()); + auto res = isl_multi_id_size(get()); return manage(res); } -isl::multi_id multi_id::set_at(int pos, isl::id el) const +isl::space multi_id::space() const { - auto res = isl_multi_id_set_at(copy(), pos, el.release()); + auto res = isl_multi_id_get_space(get()); return manage(res); } -isl::multi_id multi_id::set_id(int pos, isl::id el) const +isl::space multi_id::get_space() const { - auto res = isl_multi_id_set_id(copy(), pos, el.release()); - return manage(res); + return space(); } -isl_size multi_id::size() const +inline std::ostream &operator<<(std::ostream &os, const multi_id &obj) { - auto res = isl_multi_id_size(get()); - return res; + char *str = isl_multi_id_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::multi_pw_aff @@ -10494,7 +12961,6 @@ multi_pw_aff::multi_pw_aff(const multi_pw_aff &obj) ptr = obj.copy(); } - multi_pw_aff::multi_pw_aff(__isl_take isl_multi_pw_aff *ptr) : ptr(ptr) {} @@ -10503,26 +12969,31 @@ multi_pw_aff::multi_pw_aff(isl::aff aff) auto res = isl_multi_pw_aff_from_aff(aff.release()); ptr = res; } + multi_pw_aff::multi_pw_aff(isl::multi_aff ma) { auto res = isl_multi_pw_aff_from_multi_aff(ma.release()); ptr = res; } + 
multi_pw_aff::multi_pw_aff(isl::pw_aff pa) { auto res = isl_multi_pw_aff_from_pw_aff(pa.release()); ptr = res; } + multi_pw_aff::multi_pw_aff(isl::space space, isl::pw_aff_list list) { auto res = isl_multi_pw_aff_from_pw_aff_list(space.release(), list.release()); ptr = res; } + multi_pw_aff::multi_pw_aff(isl::pw_multi_aff pma) { auto res = isl_multi_pw_aff_from_pw_multi_aff(pma.release()); ptr = res; } + multi_pw_aff::multi_pw_aff(isl::ctx ctx, const std::string &str) { auto res = isl_multi_pw_aff_read_from_str(ctx.release(), str.c_str()); @@ -10557,200 +13028,178 @@ bool multi_pw_aff::is_null() const { return ptr == nullptr; } - isl::ctx multi_pw_aff::ctx() const { return isl::ctx(isl_multi_pw_aff_get_ctx(ptr)); } -void multi_pw_aff::dump() const { - isl_multi_pw_aff_dump(get()); -} - - isl::multi_pw_aff multi_pw_aff::add(isl::multi_pw_aff multi2) const { auto res = isl_multi_pw_aff_add(copy(), multi2.release()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::add_constant(isl::multi_val mv) const +isl::multi_union_pw_aff multi_pw_aff::add(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_pw_aff_add_constant_multi_val(copy(), mv.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).add(multi2); } -isl::multi_pw_aff multi_pw_aff::add_constant(isl::val v) const +isl::multi_pw_aff multi_pw_aff::add(const isl::aff &multi2) const { - auto res = isl_multi_pw_aff_add_constant_val(copy(), v.release()); - return manage(res); + return this->add(isl::multi_pw_aff(multi2)); } -isl::multi_pw_aff multi_pw_aff::add_dims(isl::dim type, unsigned int n) const +isl::multi_pw_aff multi_pw_aff::add(const isl::multi_aff &multi2) const { - auto res = isl_multi_pw_aff_add_dims(copy(), static_cast(type), n); - return manage(res); + return this->add(isl::multi_pw_aff(multi2)); } -isl::multi_pw_aff multi_pw_aff::align_params(isl::space model) const +isl::multi_pw_aff multi_pw_aff::add(const isl::pw_aff &multi2) const { - auto res = 
isl_multi_pw_aff_align_params(copy(), model.release()); - return manage(res); + return this->add(isl::multi_pw_aff(multi2)); } -isl::set multi_pw_aff::bind(isl::multi_id tuple) const +isl::multi_pw_aff multi_pw_aff::add(const isl::pw_multi_aff &multi2) const { - auto res = isl_multi_pw_aff_bind(copy(), tuple.release()); - return manage(res); + return this->add(isl::multi_pw_aff(multi2)); } -isl::multi_pw_aff multi_pw_aff::bind_domain(isl::multi_id tuple) const +isl::multi_pw_aff multi_pw_aff::add_constant(isl::multi_val mv) const { - auto res = isl_multi_pw_aff_bind_domain(copy(), tuple.release()); + auto res = isl_multi_pw_aff_add_constant_multi_val(copy(), mv.release()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const +isl::multi_pw_aff multi_pw_aff::add_constant(isl::val v) const { - auto res = isl_multi_pw_aff_bind_domain_wrapped_domain(copy(), tuple.release()); + auto res = isl_multi_pw_aff_add_constant_val(copy(), v.release()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::coalesce() const +isl::multi_pw_aff multi_pw_aff::add_constant(long v) const { - auto res = isl_multi_pw_aff_coalesce(copy()); - return manage(res); + return this->add_constant(isl::val(ctx(), v)); } -isl_size multi_pw_aff::dim(isl::dim type) const +isl::map multi_pw_aff::as_map() const { - auto res = isl_multi_pw_aff_dim(get(), static_cast(type)); - return res; + auto res = isl_multi_pw_aff_as_map(copy()); + return manage(res); } -isl::set multi_pw_aff::domain() const +isl::multi_aff multi_pw_aff::as_multi_aff() const { - auto res = isl_multi_pw_aff_domain(copy()); + auto res = isl_multi_pw_aff_as_multi_aff(copy()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::set multi_pw_aff::as_set() const { - auto res = isl_multi_pw_aff_drop_dims(copy(), static_cast(type), first, n); + auto res = isl_multi_pw_aff_as_set(copy()); return manage(res); } 
-isl::map multi_pw_aff::eq_map(isl::multi_pw_aff mpa2) const +isl::pw_aff multi_pw_aff::at(int pos) const { - auto res = isl_multi_pw_aff_eq_map(copy(), mpa2.release()); + auto res = isl_multi_pw_aff_get_at(get(), pos); return manage(res); } -isl::multi_pw_aff multi_pw_aff::factor_range() const +isl::pw_aff multi_pw_aff::get_at(int pos) const { - auto res = isl_multi_pw_aff_factor_range(copy()); - return manage(res); + return at(pos); } -int multi_pw_aff::find_dim_by_id(isl::dim type, const isl::id &id) const +isl::set multi_pw_aff::bind(isl::multi_id tuple) const { - auto res = isl_multi_pw_aff_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + auto res = isl_multi_pw_aff_bind(copy(), tuple.release()); + return manage(res); } -int multi_pw_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::multi_pw_aff multi_pw_aff::bind_domain(isl::multi_id tuple) const { - auto res = isl_multi_pw_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + auto res = isl_multi_pw_aff_bind_domain(copy(), tuple.release()); + return manage(res); } -isl::multi_pw_aff multi_pw_aff::flat_range_product(isl::multi_pw_aff multi2) const +isl::multi_pw_aff multi_pw_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const { - auto res = isl_multi_pw_aff_flat_range_product(copy(), multi2.release()); + auto res = isl_multi_pw_aff_bind_domain_wrapped_domain(copy(), tuple.release()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::flatten_range() const +isl::multi_pw_aff multi_pw_aff::coalesce() const { - auto res = isl_multi_pw_aff_flatten_range(copy()); + auto res = isl_multi_pw_aff_coalesce(copy()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::from_range() const +class size multi_pw_aff::dim(isl::dim type) const { - auto res = isl_multi_pw_aff_from_range(copy()); + auto res = isl_multi_pw_aff_dim(get(), static_cast(type)); return manage(res); } -isl::pw_aff multi_pw_aff::get_at(int pos) const +isl::set 
multi_pw_aff::domain() const { - auto res = isl_multi_pw_aff_get_at(get(), pos); + auto res = isl_multi_pw_aff_domain(copy()); return manage(res); } -isl::id multi_pw_aff::get_dim_id(isl::dim type, unsigned int pos) const +isl::multi_pw_aff multi_pw_aff::flat_range_product(isl::multi_pw_aff multi2) const { - auto res = isl_multi_pw_aff_get_dim_id(get(), static_cast(type), pos); + auto res = isl_multi_pw_aff_flat_range_product(copy(), multi2.release()); return manage(res); } -isl::space multi_pw_aff::get_domain_space() const +isl::multi_union_pw_aff multi_pw_aff::flat_range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_pw_aff_get_domain_space(get()); - return manage(res); + return isl::multi_union_pw_aff(*this).flat_range_product(multi2); } -uint32_t multi_pw_aff::get_hash() const +isl::multi_pw_aff multi_pw_aff::flat_range_product(const isl::aff &multi2) const { - auto res = isl_multi_pw_aff_get_hash(get()); - return res; + return this->flat_range_product(isl::multi_pw_aff(multi2)); } -isl::pw_aff_list multi_pw_aff::get_list() const +isl::multi_pw_aff multi_pw_aff::flat_range_product(const isl::multi_aff &multi2) const { - auto res = isl_multi_pw_aff_get_list(get()); - return manage(res); + return this->flat_range_product(isl::multi_pw_aff(multi2)); } -isl::pw_aff multi_pw_aff::get_pw_aff(int pos) const +isl::multi_pw_aff multi_pw_aff::flat_range_product(const isl::pw_aff &multi2) const { - auto res = isl_multi_pw_aff_get_pw_aff(get(), pos); - return manage(res); + return this->flat_range_product(isl::multi_pw_aff(multi2)); } -isl::space multi_pw_aff::get_space() const +isl::multi_pw_aff multi_pw_aff::flat_range_product(const isl::pw_multi_aff &multi2) const { - auto res = isl_multi_pw_aff_get_space(get()); - return manage(res); + return this->flat_range_product(isl::multi_pw_aff(multi2)); } -isl::id multi_pw_aff::get_tuple_id(isl::dim type) const +isl::multi_pw_aff multi_pw_aff::gist(isl::set set) const { - auto res = 
isl_multi_pw_aff_get_tuple_id(get(), static_cast(type)); + auto res = isl_multi_pw_aff_gist(copy(), set.release()); return manage(res); } -std::string multi_pw_aff::get_tuple_name(isl::dim type) const +isl::multi_union_pw_aff multi_pw_aff::gist(const isl::union_set &context) const { - auto res = isl_multi_pw_aff_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + return isl::multi_union_pw_aff(*this).gist(context); } -isl::multi_pw_aff multi_pw_aff::gist(isl::set set) const +isl::multi_pw_aff multi_pw_aff::gist(const isl::basic_set &set) const { - auto res = isl_multi_pw_aff_gist(copy(), set.release()); - return manage(res); + return this->gist(isl::set(set)); } -isl::multi_pw_aff multi_pw_aff::gist_params(isl::set set) const +isl::multi_pw_aff multi_pw_aff::gist(const isl::point &set) const { - auto res = isl_multi_pw_aff_gist_params(copy(), set.release()); - return manage(res); + return this->gist(isl::set(set)); } -boolean multi_pw_aff::has_tuple_id(isl::dim type) const +boolean multi_pw_aff::has_range_tuple_id() const { - auto res = isl_multi_pw_aff_has_tuple_id(get(), static_cast(type)); + auto res = isl_multi_pw_aff_has_range_tuple_id(get()); return manage(res); } @@ -10772,12 +13221,6 @@ isl::multi_pw_aff multi_pw_aff::identity_on_domain(isl::space space) return manage(res); } -isl::multi_pw_aff multi_pw_aff::insert_dims(isl::dim type, unsigned int first, unsigned int n) const -{ - auto res = isl_multi_pw_aff_insert_dims(copy(), static_cast(type), first, n); - return manage(res); -} - isl::multi_pw_aff multi_pw_aff::insert_domain(isl::space domain) const { auto res = isl_multi_pw_aff_insert_domain(copy(), domain.release()); @@ -10790,70 +13233,65 @@ isl::multi_pw_aff multi_pw_aff::intersect_domain(isl::set domain) const return manage(res); } -isl::multi_pw_aff multi_pw_aff::intersect_params(isl::set set) const +isl::multi_union_pw_aff multi_pw_aff::intersect_domain(const isl::union_set &uset) const { - auto res = 
isl_multi_pw_aff_intersect_params(copy(), set.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).intersect_domain(uset); } -boolean multi_pw_aff::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::multi_pw_aff multi_pw_aff::intersect_domain(const isl::basic_set &domain) const { - auto res = isl_multi_pw_aff_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return this->intersect_domain(isl::set(domain)); } -boolean multi_pw_aff::involves_nan() const +isl::multi_pw_aff multi_pw_aff::intersect_domain(const isl::point &domain) const { - auto res = isl_multi_pw_aff_involves_nan(get()); - return manage(res); + return this->intersect_domain(isl::set(domain)); } -boolean multi_pw_aff::involves_param(const isl::id &id) const +isl::multi_pw_aff multi_pw_aff::intersect_params(isl::set set) const { - auto res = isl_multi_pw_aff_involves_param_id(get(), id.get()); + auto res = isl_multi_pw_aff_intersect_params(copy(), set.release()); return manage(res); } -boolean multi_pw_aff::involves_param(const isl::id_list &list) const +boolean multi_pw_aff::involves_nan() const { - auto res = isl_multi_pw_aff_involves_param_id_list(get(), list.get()); + auto res = isl_multi_pw_aff_involves_nan(get()); return manage(res); } -boolean multi_pw_aff::is_cst() const +boolean multi_pw_aff::involves_param(const isl::id &id) const { - auto res = isl_multi_pw_aff_is_cst(get()); + auto res = isl_multi_pw_aff_involves_param_id(get(), id.get()); return manage(res); } -boolean multi_pw_aff::is_equal(const isl::multi_pw_aff &mpa2) const +boolean multi_pw_aff::involves_param(const std::string &id) const { - auto res = isl_multi_pw_aff_is_equal(get(), mpa2.get()); - return manage(res); + return this->involves_param(isl::id(ctx(), id)); } -isl::map multi_pw_aff::lex_ge_map(isl::multi_pw_aff mpa2) const +boolean multi_pw_aff::involves_param(const isl::id_list &list) const { - auto res = isl_multi_pw_aff_lex_ge_map(copy(), 
mpa2.release()); + auto res = isl_multi_pw_aff_involves_param_id_list(get(), list.get()); return manage(res); } -isl::map multi_pw_aff::lex_gt_map(isl::multi_pw_aff mpa2) const +boolean multi_pw_aff::isa_multi_aff() const { - auto res = isl_multi_pw_aff_lex_gt_map(copy(), mpa2.release()); + auto res = isl_multi_pw_aff_isa_multi_aff(get()); return manage(res); } -isl::map multi_pw_aff::lex_le_map(isl::multi_pw_aff mpa2) const +isl::pw_aff_list multi_pw_aff::list() const { - auto res = isl_multi_pw_aff_lex_le_map(copy(), mpa2.release()); + auto res = isl_multi_pw_aff_get_list(get()); return manage(res); } -isl::map multi_pw_aff::lex_lt_map(isl::multi_pw_aff mpa2) const +isl::pw_aff_list multi_pw_aff::get_list() const { - auto res = isl_multi_pw_aff_lex_lt_map(copy(), mpa2.release()); - return manage(res); + return list(); } isl::multi_pw_aff multi_pw_aff::max(isl::multi_pw_aff multi2) const @@ -10880,39 +13318,46 @@ isl::multi_val multi_pw_aff::min_multi_val() const return manage(res); } -isl::multi_pw_aff multi_pw_aff::mod_multi_val(isl::multi_val mv) const +isl::multi_pw_aff multi_pw_aff::neg() const { - auto res = isl_multi_pw_aff_mod_multi_val(copy(), mv.release()); + auto res = isl_multi_pw_aff_neg(copy()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +boolean multi_pw_aff::plain_is_equal(const isl::multi_pw_aff &multi2) const { - auto res = isl_multi_pw_aff_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); + auto res = isl_multi_pw_aff_plain_is_equal(get(), multi2.get()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::neg() const +boolean multi_pw_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_pw_aff_neg(copy()); - return manage(res); + return isl::multi_union_pw_aff(*this).plain_is_equal(multi2); } -boolean multi_pw_aff::plain_is_equal(const 
isl::multi_pw_aff &multi2) const +boolean multi_pw_aff::plain_is_equal(const isl::aff &multi2) const { - auto res = isl_multi_pw_aff_plain_is_equal(get(), multi2.get()); - return manage(res); + return this->plain_is_equal(isl::multi_pw_aff(multi2)); } -isl::multi_pw_aff multi_pw_aff::product(isl::multi_pw_aff multi2) const +boolean multi_pw_aff::plain_is_equal(const isl::multi_aff &multi2) const { - auto res = isl_multi_pw_aff_product(copy(), multi2.release()); - return manage(res); + return this->plain_is_equal(isl::multi_pw_aff(multi2)); +} + +boolean multi_pw_aff::plain_is_equal(const isl::pw_aff &multi2) const +{ + return this->plain_is_equal(isl::multi_pw_aff(multi2)); +} + +boolean multi_pw_aff::plain_is_equal(const isl::pw_multi_aff &multi2) const +{ + return this->plain_is_equal(isl::multi_pw_aff(multi2)); } -isl::multi_pw_aff multi_pw_aff::project_domain_on_params() const +isl::multi_pw_aff multi_pw_aff::product(isl::multi_pw_aff multi2) const { - auto res = isl_multi_pw_aff_project_domain_on_params(copy()); + auto res = isl_multi_pw_aff_product(copy(), multi2.release()); return manage(res); } @@ -10934,33 +13379,56 @@ isl::multi_pw_aff multi_pw_aff::pullback(isl::pw_multi_aff pma) const return manage(res); } -isl::multi_pw_aff multi_pw_aff::range_factor_domain() const +isl::multi_union_pw_aff multi_pw_aff::pullback(const isl::union_pw_multi_aff &upma) const { - auto res = isl_multi_pw_aff_range_factor_domain(copy()); - return manage(res); + return isl::multi_union_pw_aff(*this).pullback(upma); } -isl::multi_pw_aff multi_pw_aff::range_factor_range() const +isl::multi_pw_aff multi_pw_aff::range_product(isl::multi_pw_aff multi2) const { - auto res = isl_multi_pw_aff_range_factor_range(copy()); + auto res = isl_multi_pw_aff_range_product(copy(), multi2.release()); return manage(res); } -boolean multi_pw_aff::range_is_wrapping() const +isl::multi_union_pw_aff multi_pw_aff::range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = 
isl_multi_pw_aff_range_is_wrapping(get()); - return manage(res); + return isl::multi_union_pw_aff(*this).range_product(multi2); } -isl::multi_pw_aff multi_pw_aff::range_product(isl::multi_pw_aff multi2) const +isl::multi_pw_aff multi_pw_aff::range_product(const isl::aff &multi2) const { - auto res = isl_multi_pw_aff_range_product(copy(), multi2.release()); + return this->range_product(isl::multi_pw_aff(multi2)); +} + +isl::multi_pw_aff multi_pw_aff::range_product(const isl::multi_aff &multi2) const +{ + return this->range_product(isl::multi_pw_aff(multi2)); +} + +isl::multi_pw_aff multi_pw_aff::range_product(const isl::pw_aff &multi2) const +{ + return this->range_product(isl::multi_pw_aff(multi2)); +} + +isl::multi_pw_aff multi_pw_aff::range_product(const isl::pw_multi_aff &multi2) const +{ + return this->range_product(isl::multi_pw_aff(multi2)); +} + +isl::id multi_pw_aff::range_tuple_id() const +{ + auto res = isl_multi_pw_aff_get_range_tuple_id(get()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::range_splice(unsigned int pos, isl::multi_pw_aff multi2) const +isl::id multi_pw_aff::get_range_tuple_id() const +{ + return range_tuple_id(); +} + +isl::multi_pw_aff multi_pw_aff::reset_range_tuple_id() const { - auto res = isl_multi_pw_aff_range_splice(copy(), pos, multi2.release()); + auto res = isl_multi_pw_aff_reset_range_tuple_id(copy()); return manage(res); } @@ -10970,12 +13438,6 @@ isl::multi_pw_aff multi_pw_aff::reset_tuple_id(isl::dim type) const return manage(res); } -isl::multi_pw_aff multi_pw_aff::reset_user() const -{ - auto res = isl_multi_pw_aff_reset_user(copy()); - return manage(res); -} - isl::multi_pw_aff multi_pw_aff::scale(isl::multi_val mv) const { auto res = isl_multi_pw_aff_scale_multi_val(copy(), mv.release()); @@ -10988,6 +13450,11 @@ isl::multi_pw_aff multi_pw_aff::scale(isl::val v) const return manage(res); } +isl::multi_pw_aff multi_pw_aff::scale(long v) const +{ + return this->scale(isl::val(ctx(), v)); +} + isl::multi_pw_aff 
multi_pw_aff::scale_down(isl::multi_val mv) const { auto res = isl_multi_pw_aff_scale_down_multi_val(copy(), mv.release()); @@ -11000,16 +13467,20 @@ isl::multi_pw_aff multi_pw_aff::scale_down(isl::val v) const return manage(res); } +isl::multi_pw_aff multi_pw_aff::scale_down(long v) const +{ + return this->scale_down(isl::val(ctx(), v)); +} + isl::multi_pw_aff multi_pw_aff::set_at(int pos, isl::pw_aff el) const { auto res = isl_multi_pw_aff_set_at(copy(), pos, el.release()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::multi_union_pw_aff multi_pw_aff::set_at(int pos, const isl::union_pw_aff &el) const { - auto res = isl_multi_pw_aff_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).set_at(pos, el); } isl::multi_pw_aff multi_pw_aff::set_pw_aff(int pos, isl::pw_aff el) const @@ -11018,36 +13489,70 @@ isl::multi_pw_aff multi_pw_aff::set_pw_aff(int pos, isl::pw_aff el) const return manage(res); } -isl::multi_pw_aff multi_pw_aff::set_tuple_id(isl::dim type, isl::id id) const +isl::multi_pw_aff multi_pw_aff::set_range_tuple(isl::id id) const { - auto res = isl_multi_pw_aff_set_tuple_id(copy(), static_cast(type), id.release()); + auto res = isl_multi_pw_aff_set_range_tuple_id(copy(), id.release()); return manage(res); } -isl::multi_pw_aff multi_pw_aff::set_tuple_name(isl::dim type, const std::string &s) const +isl::multi_pw_aff multi_pw_aff::set_range_tuple(const std::string &id) const { - auto res = isl_multi_pw_aff_set_tuple_name(copy(), static_cast(type), s.c_str()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl_size multi_pw_aff::size() const +isl::multi_union_pw_aff multi_pw_aff::set_union_pw_aff(int pos, const isl::union_pw_aff &el) const +{ + return isl::multi_union_pw_aff(*this).set_union_pw_aff(pos, el); +} + +class size multi_pw_aff::size() const { auto res = 
isl_multi_pw_aff_size(get()); - return res; + return manage(res); } -isl::multi_pw_aff multi_pw_aff::splice(unsigned int in_pos, unsigned int out_pos, isl::multi_pw_aff multi2) const +isl::space multi_pw_aff::space() const { - auto res = isl_multi_pw_aff_splice(copy(), in_pos, out_pos, multi2.release()); + auto res = isl_multi_pw_aff_get_space(get()); return manage(res); } +isl::space multi_pw_aff::get_space() const +{ + return space(); +} + isl::multi_pw_aff multi_pw_aff::sub(isl::multi_pw_aff multi2) const { auto res = isl_multi_pw_aff_sub(copy(), multi2.release()); return manage(res); } +isl::multi_union_pw_aff multi_pw_aff::sub(const isl::multi_union_pw_aff &multi2) const +{ + return isl::multi_union_pw_aff(*this).sub(multi2); +} + +isl::multi_pw_aff multi_pw_aff::sub(const isl::aff &multi2) const +{ + return this->sub(isl::multi_pw_aff(multi2)); +} + +isl::multi_pw_aff multi_pw_aff::sub(const isl::multi_aff &multi2) const +{ + return this->sub(isl::multi_pw_aff(multi2)); +} + +isl::multi_pw_aff multi_pw_aff::sub(const isl::pw_aff &multi2) const +{ + return this->sub(isl::multi_pw_aff(multi2)); +} + +isl::multi_pw_aff multi_pw_aff::sub(const isl::pw_multi_aff &multi2) const +{ + return this->sub(isl::multi_pw_aff(multi2)); +} + isl::multi_pw_aff multi_pw_aff::unbind_params_insert_domain(isl::multi_id domain) const { auto res = isl_multi_pw_aff_unbind_params_insert_domain(copy(), domain.release()); @@ -11060,12 +13565,49 @@ isl::multi_pw_aff multi_pw_aff::union_add(isl::multi_pw_aff mpa2) const return manage(res); } +isl::multi_union_pw_aff multi_pw_aff::union_add(const isl::multi_union_pw_aff &mupa2) const +{ + return isl::multi_union_pw_aff(*this).union_add(mupa2); +} + +isl::multi_pw_aff multi_pw_aff::union_add(const isl::aff &mpa2) const +{ + return this->union_add(isl::multi_pw_aff(mpa2)); +} + +isl::multi_pw_aff multi_pw_aff::union_add(const isl::multi_aff &mpa2) const +{ + return this->union_add(isl::multi_pw_aff(mpa2)); +} + +isl::multi_pw_aff 
multi_pw_aff::union_add(const isl::pw_aff &mpa2) const +{ + return this->union_add(isl::multi_pw_aff(mpa2)); +} + +isl::multi_pw_aff multi_pw_aff::union_add(const isl::pw_multi_aff &mpa2) const +{ + return this->union_add(isl::multi_pw_aff(mpa2)); +} + isl::multi_pw_aff multi_pw_aff::zero(isl::space space) { auto res = isl_multi_pw_aff_zero(space.release()); return manage(res); } +inline std::ostream &operator<<(std::ostream &os, const multi_pw_aff &obj) +{ + char *str = isl_multi_pw_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // implementations for isl::multi_union_pw_aff multi_union_pw_aff manage(__isl_take isl_multi_union_pw_aff *ptr) { return multi_union_pw_aff(ptr); @@ -11084,7 +13626,6 @@ multi_union_pw_aff::multi_union_pw_aff(const multi_union_pw_aff &obj) ptr = obj.copy(); } - multi_union_pw_aff::multi_union_pw_aff(__isl_take isl_multi_union_pw_aff *ptr) : ptr(ptr) {} @@ -11093,21 +13634,25 @@ multi_union_pw_aff::multi_union_pw_aff(isl::multi_pw_aff mpa) auto res = isl_multi_union_pw_aff_from_multi_pw_aff(mpa.release()); ptr = res; } + multi_union_pw_aff::multi_union_pw_aff(isl::union_pw_aff upa) { auto res = isl_multi_union_pw_aff_from_union_pw_aff(upa.release()); ptr = res; } + multi_union_pw_aff::multi_union_pw_aff(isl::space space, isl::union_pw_aff_list list) { auto res = isl_multi_union_pw_aff_from_union_pw_aff_list(space.release(), list.release()); ptr = res; } + multi_union_pw_aff::multi_union_pw_aff(isl::union_pw_multi_aff upma) { auto res = isl_multi_union_pw_aff_from_union_pw_multi_aff(upma.release()); ptr = res; } + multi_union_pw_aff::multi_union_pw_aff(isl::ctx ctx, const std::string &str) { auto res = isl_multi_union_pw_aff_read_from_str(ctx.release(), str.c_str()); @@ -11142,3524 +13687,3393 @@ bool multi_union_pw_aff::is_null() const { return ptr == nullptr; } - isl::ctx multi_union_pw_aff::ctx() const { return 
isl::ctx(isl_multi_union_pw_aff_get_ctx(ptr)); } -void multi_union_pw_aff::dump() const { - isl_multi_union_pw_aff_dump(get()); +isl::multi_union_pw_aff multi_union_pw_aff::add(isl::multi_union_pw_aff multi2) const +{ + auto res = isl_multi_union_pw_aff_add(copy(), multi2.release()); + return manage(res); +} + +isl::union_pw_aff multi_union_pw_aff::at(int pos) const +{ + auto res = isl_multi_union_pw_aff_get_at(get(), pos); + return manage(res); } +isl::union_pw_aff multi_union_pw_aff::get_at(int pos) const +{ + return at(pos); +} -isl::multi_union_pw_aff multi_union_pw_aff::add(isl::multi_union_pw_aff multi2) const +isl::union_set multi_union_pw_aff::bind(isl::multi_id tuple) const { - auto res = isl_multi_union_pw_aff_add(copy(), multi2.release()); + auto res = isl_multi_union_pw_aff_bind(copy(), tuple.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::align_params(isl::space model) const +isl::multi_union_pw_aff multi_union_pw_aff::coalesce() const { - auto res = isl_multi_union_pw_aff_align_params(copy(), model.release()); + auto res = isl_multi_union_pw_aff_coalesce(copy()); return manage(res); } -isl::union_pw_aff multi_union_pw_aff::apply_aff(isl::aff aff) const +class size multi_union_pw_aff::dim(isl::dim type) const { - auto res = isl_multi_union_pw_aff_apply_aff(copy(), aff.release()); + auto res = isl_multi_union_pw_aff_dim(get(), static_cast(type)); return manage(res); } -isl::union_pw_aff multi_union_pw_aff::apply_pw_aff(isl::pw_aff pa) const +isl::union_set multi_union_pw_aff::domain() const { - auto res = isl_multi_union_pw_aff_apply_pw_aff(copy(), pa.release()); + auto res = isl_multi_union_pw_aff_domain(copy()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::apply_pw_multi_aff(isl::pw_multi_aff pma) const +isl::multi_union_pw_aff multi_union_pw_aff::flat_range_product(isl::multi_union_pw_aff multi2) const { - auto res = isl_multi_union_pw_aff_apply_pw_multi_aff(copy(), pma.release()); + auto res = 
isl_multi_union_pw_aff_flat_range_product(copy(), multi2.release()); return manage(res); } -isl::union_set multi_union_pw_aff::bind(isl::multi_id tuple) const +isl::multi_union_pw_aff multi_union_pw_aff::from_union_map(isl::union_map umap) { - auto res = isl_multi_union_pw_aff_bind(copy(), tuple.release()); + auto res = isl_multi_union_pw_aff_from_union_map(umap.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::coalesce() const +isl::multi_union_pw_aff multi_union_pw_aff::gist(isl::union_set context) const { - auto res = isl_multi_union_pw_aff_coalesce(copy()); + auto res = isl_multi_union_pw_aff_gist(copy(), context.release()); return manage(res); } -isl_size multi_union_pw_aff::dim(isl::dim type) const +boolean multi_union_pw_aff::has_range_tuple_id() const { - auto res = isl_multi_union_pw_aff_dim(get(), static_cast(type)); - return res; + auto res = isl_multi_union_pw_aff_has_range_tuple_id(get()); + return manage(res); } -isl::union_set multi_union_pw_aff::domain() const +isl::multi_union_pw_aff multi_union_pw_aff::intersect_domain(isl::union_set uset) const { - auto res = isl_multi_union_pw_aff_domain(copy()); + auto res = isl_multi_union_pw_aff_intersect_domain(copy(), uset.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::multi_union_pw_aff multi_union_pw_aff::intersect_params(isl::set params) const { - auto res = isl_multi_union_pw_aff_drop_dims(copy(), static_cast(type), first, n); + auto res = isl_multi_union_pw_aff_intersect_params(copy(), params.release()); return manage(res); } -isl::multi_pw_aff multi_union_pw_aff::extract_multi_pw_aff(isl::space space) const +boolean multi_union_pw_aff::involves_nan() const { - auto res = isl_multi_union_pw_aff_extract_multi_pw_aff(get(), space.release()); + auto res = isl_multi_union_pw_aff_involves_nan(get()); return manage(res); } -isl::multi_union_pw_aff 
multi_union_pw_aff::factor_range() const +isl::union_pw_aff_list multi_union_pw_aff::list() const { - auto res = isl_multi_union_pw_aff_factor_range(copy()); + auto res = isl_multi_union_pw_aff_get_list(get()); return manage(res); } -int multi_union_pw_aff::find_dim_by_id(isl::dim type, const isl::id &id) const +isl::union_pw_aff_list multi_union_pw_aff::get_list() const { - auto res = isl_multi_union_pw_aff_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + return list(); } -int multi_union_pw_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::multi_union_pw_aff multi_union_pw_aff::neg() const { - auto res = isl_multi_union_pw_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + auto res = isl_multi_union_pw_aff_neg(copy()); + return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::flat_range_product(isl::multi_union_pw_aff multi2) const +boolean multi_union_pw_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_multi_union_pw_aff_flat_range_product(copy(), multi2.release()); + auto res = isl_multi_union_pw_aff_plain_is_equal(get(), multi2.get()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::flatten_range() const +isl::multi_union_pw_aff multi_union_pw_aff::pullback(isl::union_pw_multi_aff upma) const { - auto res = isl_multi_union_pw_aff_flatten_range(copy()); + auto res = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(copy(), upma.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::floor() const +isl::multi_union_pw_aff multi_union_pw_aff::range_product(isl::multi_union_pw_aff multi2) const { - auto res = isl_multi_union_pw_aff_floor(copy()); + auto res = isl_multi_union_pw_aff_range_product(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::from_multi_aff(isl::multi_aff ma) +isl::id multi_union_pw_aff::range_tuple_id() const { - auto res = 
isl_multi_union_pw_aff_from_multi_aff(ma.release()); + auto res = isl_multi_union_pw_aff_get_range_tuple_id(get()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::from_range() const +isl::id multi_union_pw_aff::get_range_tuple_id() const +{ + return range_tuple_id(); +} + +isl::multi_union_pw_aff multi_union_pw_aff::reset_range_tuple_id() const { - auto res = isl_multi_union_pw_aff_from_range(copy()); + auto res = isl_multi_union_pw_aff_reset_range_tuple_id(copy()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::from_union_map(isl::union_map umap) +isl::multi_union_pw_aff multi_union_pw_aff::reset_tuple_id(isl::dim type) const { - auto res = isl_multi_union_pw_aff_from_union_map(umap.release()); + auto res = isl_multi_union_pw_aff_reset_tuple_id(copy(), static_cast(type)); return manage(res); } -isl::union_pw_aff multi_union_pw_aff::get_at(int pos) const +isl::multi_union_pw_aff multi_union_pw_aff::scale(isl::multi_val mv) const { - auto res = isl_multi_union_pw_aff_get_at(get(), pos); + auto res = isl_multi_union_pw_aff_scale_multi_val(copy(), mv.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::scale(isl::val v) const +{ + auto res = isl_multi_union_pw_aff_scale_val(copy(), v.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::scale(long v) const +{ + return this->scale(isl::val(ctx(), v)); +} + +isl::multi_union_pw_aff multi_union_pw_aff::scale_down(isl::multi_val mv) const +{ + auto res = isl_multi_union_pw_aff_scale_down_multi_val(copy(), mv.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::scale_down(isl::val v) const +{ + auto res = isl_multi_union_pw_aff_scale_down_val(copy(), v.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::scale_down(long v) const +{ + return this->scale_down(isl::val(ctx(), v)); +} + +isl::multi_union_pw_aff multi_union_pw_aff::set_at(int pos, isl::union_pw_aff 
el) const +{ + auto res = isl_multi_union_pw_aff_set_at(copy(), pos, el.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::set_range_tuple(isl::id id) const +{ + auto res = isl_multi_union_pw_aff_set_range_tuple_id(copy(), id.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::set_range_tuple(const std::string &id) const +{ + return this->set_range_tuple(isl::id(ctx(), id)); +} + +isl::multi_union_pw_aff multi_union_pw_aff::set_union_pw_aff(int pos, isl::union_pw_aff el) const +{ + auto res = isl_multi_union_pw_aff_set_union_pw_aff(copy(), pos, el.release()); + return manage(res); +} + +class size multi_union_pw_aff::size() const +{ + auto res = isl_multi_union_pw_aff_size(get()); + return manage(res); +} + +isl::space multi_union_pw_aff::space() const +{ + auto res = isl_multi_union_pw_aff_get_space(get()); + return manage(res); +} + +isl::space multi_union_pw_aff::get_space() const +{ + return space(); +} + +isl::multi_union_pw_aff multi_union_pw_aff::sub(isl::multi_union_pw_aff multi2) const +{ + auto res = isl_multi_union_pw_aff_sub(copy(), multi2.release()); + return manage(res); +} + +isl::multi_union_pw_aff multi_union_pw_aff::union_add(isl::multi_union_pw_aff mupa2) const +{ + auto res = isl_multi_union_pw_aff_union_add(copy(), mupa2.release()); return manage(res); } -isl::id multi_union_pw_aff::get_dim_id(isl::dim type, unsigned int pos) const +isl::multi_union_pw_aff multi_union_pw_aff::zero(isl::space space) { - auto res = isl_multi_union_pw_aff_get_dim_id(get(), static_cast(type), pos); + auto res = isl_multi_union_pw_aff_zero(space.release()); return manage(res); } -isl::space multi_union_pw_aff::get_domain_space() const -{ - auto res = isl_multi_union_pw_aff_get_domain_space(get()); - return manage(res); +inline std::ostream &operator<<(std::ostream &os, const multi_union_pw_aff &obj) +{ + char *str = isl_multi_union_pw_aff_to_str(obj.get()); + if (!str) { + 
os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + +// implementations for isl::multi_val +multi_val manage(__isl_take isl_multi_val *ptr) { + return multi_val(ptr); +} +multi_val manage_copy(__isl_keep isl_multi_val *ptr) { + ptr = isl_multi_val_copy(ptr); + return multi_val(ptr); +} + +multi_val::multi_val() + : ptr(nullptr) {} + +multi_val::multi_val(const multi_val &obj) + : ptr(nullptr) +{ + ptr = obj.copy(); +} + +multi_val::multi_val(__isl_take isl_multi_val *ptr) + : ptr(ptr) {} + +multi_val::multi_val(isl::space space, isl::val_list list) +{ + auto res = isl_multi_val_from_val_list(space.release(), list.release()); + ptr = res; +} + +multi_val::multi_val(isl::ctx ctx, const std::string &str) +{ + auto res = isl_multi_val_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} + +multi_val &multi_val::operator=(multi_val obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +multi_val::~multi_val() { + if (ptr) + isl_multi_val_free(ptr); +} + +__isl_give isl_multi_val *multi_val::copy() const & { + return isl_multi_val_copy(ptr); +} + +__isl_keep isl_multi_val *multi_val::get() const { + return ptr; +} + +__isl_give isl_multi_val *multi_val::release() { + isl_multi_val *tmp = ptr; + ptr = nullptr; + return tmp; } -isl::union_pw_aff_list multi_union_pw_aff::get_list() const -{ - auto res = isl_multi_union_pw_aff_get_list(get()); - return manage(res); +bool multi_val::is_null() const { + return ptr == nullptr; } -isl::space multi_union_pw_aff::get_space() const -{ - auto res = isl_multi_union_pw_aff_get_space(get()); - return manage(res); +isl::ctx multi_val::ctx() const { + return isl::ctx(isl_multi_val_get_ctx(ptr)); } -isl::id multi_union_pw_aff::get_tuple_id(isl::dim type) const +isl::multi_val multi_val::add(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_get_tuple_id(get(), static_cast(type)); + auto res = isl_multi_val_add(copy(), multi2.release()); return manage(res); } 
-std::string multi_union_pw_aff::get_tuple_name(isl::dim type) const -{ - auto res = isl_multi_union_pw_aff_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; -} - -isl::union_pw_aff multi_union_pw_aff::get_union_pw_aff(int pos) const +isl::multi_val multi_val::add(isl::val v) const { - auto res = isl_multi_union_pw_aff_get_union_pw_aff(get(), pos); + auto res = isl_multi_val_add_val(copy(), v.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::gist(isl::union_set context) const +isl::multi_val multi_val::add(long v) const { - auto res = isl_multi_union_pw_aff_gist(copy(), context.release()); - return manage(res); + return this->add(isl::val(ctx(), v)); } -isl::multi_union_pw_aff multi_union_pw_aff::gist_params(isl::set context) const +isl::val multi_val::at(int pos) const { - auto res = isl_multi_union_pw_aff_gist_params(copy(), context.release()); + auto res = isl_multi_val_get_at(get(), pos); return manage(res); } -boolean multi_union_pw_aff::has_tuple_id(isl::dim type) const +isl::val multi_val::get_at(int pos) const { - auto res = isl_multi_union_pw_aff_has_tuple_id(get(), static_cast(type)); - return manage(res); + return at(pos); } -isl::multi_union_pw_aff multi_union_pw_aff::intersect_domain(isl::union_set uset) const +class size multi_val::dim(isl::dim type) const { - auto res = isl_multi_union_pw_aff_intersect_domain(copy(), uset.release()); + auto res = isl_multi_val_dim(get(), static_cast(type)); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::intersect_params(isl::set params) const +isl::multi_val multi_val::flat_range_product(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_intersect_params(copy(), params.release()); + auto res = isl_multi_val_flat_range_product(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::intersect_range(isl::set set) const +boolean multi_val::has_range_tuple_id() const { - auto res = 
isl_multi_union_pw_aff_intersect_range(copy(), set.release()); + auto res = isl_multi_val_has_range_tuple_id(get()); return manage(res); } -boolean multi_union_pw_aff::involves_nan() const +boolean multi_val::involves_nan() const { - auto res = isl_multi_union_pw_aff_involves_nan(get()); + auto res = isl_multi_val_involves_nan(get()); return manage(res); } -isl::multi_val multi_union_pw_aff::max_multi_val() const +isl::val_list multi_val::list() const { - auto res = isl_multi_union_pw_aff_max_multi_val(copy()); + auto res = isl_multi_val_get_list(get()); return manage(res); } -isl::multi_val multi_union_pw_aff::min_multi_val() const +isl::val_list multi_val::get_list() const { - auto res = isl_multi_union_pw_aff_min_multi_val(copy()); - return manage(res); + return list(); } -isl::multi_union_pw_aff multi_union_pw_aff::mod_multi_val(isl::multi_val mv) const +isl::multi_val multi_val::max(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_mod_multi_val(copy(), mv.release()); + auto res = isl_multi_val_max(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::multi_aff_on_domain(isl::union_set domain, isl::multi_aff ma) +isl::multi_val multi_val::min(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_multi_aff_on_domain(domain.release(), ma.release()); + auto res = isl_multi_val_min(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::multi_val_on_domain(isl::union_set domain, isl::multi_val mv) +isl::multi_val multi_val::neg() const { - auto res = isl_multi_union_pw_aff_multi_val_on_domain(domain.release(), mv.release()); + auto res = isl_multi_val_neg(copy()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::neg() const +boolean multi_val::plain_is_equal(const isl::multi_val &multi2) const { - auto res = isl_multi_union_pw_aff_neg(copy()); + auto res = isl_multi_val_plain_is_equal(get(), multi2.get()); return manage(res); } -boolean 
multi_union_pw_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const +isl::multi_val multi_val::product(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_plain_is_equal(get(), multi2.get()); + auto res = isl_multi_val_product(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::pullback(isl::union_pw_multi_aff upma) const +isl::multi_val multi_val::range_product(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(copy(), upma.release()); + auto res = isl_multi_val_range_product(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::pw_multi_aff_on_domain(isl::union_set domain, isl::pw_multi_aff pma) +isl::id multi_val::range_tuple_id() const { - auto res = isl_multi_union_pw_aff_pw_multi_aff_on_domain(domain.release(), pma.release()); + auto res = isl_multi_val_get_range_tuple_id(get()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::range_factor_domain() const +isl::id multi_val::get_range_tuple_id() const { - auto res = isl_multi_union_pw_aff_range_factor_domain(copy()); - return manage(res); + return range_tuple_id(); } -isl::multi_union_pw_aff multi_union_pw_aff::range_factor_range() const +isl::multi_val multi_val::reset_range_tuple_id() const { - auto res = isl_multi_union_pw_aff_range_factor_range(copy()); + auto res = isl_multi_val_reset_range_tuple_id(copy()); return manage(res); } -boolean multi_union_pw_aff::range_is_wrapping() const +isl::multi_val multi_val::reset_tuple_id(isl::dim type) const { - auto res = isl_multi_union_pw_aff_range_is_wrapping(get()); + auto res = isl_multi_val_reset_tuple_id(copy(), static_cast(type)); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::range_product(isl::multi_union_pw_aff multi2) const +isl::multi_val multi_val::scale(isl::multi_val mv) const { - auto res = isl_multi_union_pw_aff_range_product(copy(), multi2.release()); + auto 
res = isl_multi_val_scale_multi_val(copy(), mv.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::range_splice(unsigned int pos, isl::multi_union_pw_aff multi2) const +isl::multi_val multi_val::scale(isl::val v) const { - auto res = isl_multi_union_pw_aff_range_splice(copy(), pos, multi2.release()); + auto res = isl_multi_val_scale_val(copy(), v.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::reset_tuple_id(isl::dim type) const +isl::multi_val multi_val::scale(long v) const { - auto res = isl_multi_union_pw_aff_reset_tuple_id(copy(), static_cast(type)); - return manage(res); + return this->scale(isl::val(ctx(), v)); } -isl::multi_union_pw_aff multi_union_pw_aff::reset_user() const +isl::multi_val multi_val::scale_down(isl::multi_val mv) const { - auto res = isl_multi_union_pw_aff_reset_user(copy()); + auto res = isl_multi_val_scale_down_multi_val(copy(), mv.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::scale(isl::multi_val mv) const +isl::multi_val multi_val::scale_down(isl::val v) const { - auto res = isl_multi_union_pw_aff_scale_multi_val(copy(), mv.release()); + auto res = isl_multi_val_scale_down_val(copy(), v.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::scale(isl::val v) const +isl::multi_val multi_val::scale_down(long v) const { - auto res = isl_multi_union_pw_aff_scale_val(copy(), v.release()); - return manage(res); + return this->scale_down(isl::val(ctx(), v)); } -isl::multi_union_pw_aff multi_union_pw_aff::scale_down(isl::multi_val mv) const +isl::multi_val multi_val::set_at(int pos, isl::val el) const { - auto res = isl_multi_union_pw_aff_scale_down_multi_val(copy(), mv.release()); + auto res = isl_multi_val_set_at(copy(), pos, el.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::scale_down(isl::val v) const +isl::multi_val multi_val::set_at(int pos, long el) const { - auto res = 
isl_multi_union_pw_aff_scale_down_val(copy(), v.release()); - return manage(res); + return this->set_at(pos, isl::val(ctx(), el)); } -isl::multi_union_pw_aff multi_union_pw_aff::set_at(int pos, isl::union_pw_aff el) const +isl::multi_val multi_val::set_range_tuple(isl::id id) const { - auto res = isl_multi_union_pw_aff_set_at(copy(), pos, el.release()); + auto res = isl_multi_val_set_range_tuple_id(copy(), id.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::multi_val multi_val::set_range_tuple(const std::string &id) const { - auto res = isl_multi_union_pw_aff_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl::multi_union_pw_aff multi_union_pw_aff::set_tuple_id(isl::dim type, isl::id id) const +isl::multi_val multi_val::set_val(int pos, isl::val el) const { - auto res = isl_multi_union_pw_aff_set_tuple_id(copy(), static_cast(type), id.release()); + auto res = isl_multi_val_set_val(copy(), pos, el.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::set_tuple_name(isl::dim type, const std::string &s) const +isl::multi_val multi_val::set_val(int pos, long el) const { - auto res = isl_multi_union_pw_aff_set_tuple_name(copy(), static_cast(type), s.c_str()); - return manage(res); + return this->set_val(pos, isl::val(ctx(), el)); } -isl::multi_union_pw_aff multi_union_pw_aff::set_union_pw_aff(int pos, isl::union_pw_aff el) const +class size multi_val::size() const { - auto res = isl_multi_union_pw_aff_set_union_pw_aff(copy(), pos, el.release()); + auto res = isl_multi_val_size(get()); return manage(res); } -isl_size multi_union_pw_aff::size() const +isl::space multi_val::space() const { - auto res = isl_multi_union_pw_aff_size(get()); - return res; + auto res = isl_multi_val_get_space(get()); + return manage(res); } -isl::multi_union_pw_aff 
multi_union_pw_aff::sub(isl::multi_union_pw_aff multi2) const +isl::space multi_val::get_space() const { - auto res = isl_multi_union_pw_aff_sub(copy(), multi2.release()); - return manage(res); + return space(); } -isl::multi_union_pw_aff multi_union_pw_aff::union_add(isl::multi_union_pw_aff mupa2) const +isl::multi_val multi_val::sub(isl::multi_val multi2) const { - auto res = isl_multi_union_pw_aff_union_add(copy(), mupa2.release()); + auto res = isl_multi_val_sub(copy(), multi2.release()); return manage(res); } -isl::multi_union_pw_aff multi_union_pw_aff::zero(isl::space space) +isl::multi_val multi_val::zero(isl::space space) { - auto res = isl_multi_union_pw_aff_zero(space.release()); + auto res = isl_multi_val_zero(space.release()); return manage(res); } -isl::union_set multi_union_pw_aff::zero_union_set() const +inline std::ostream &operator<<(std::ostream &os, const multi_val &obj) { - auto res = isl_multi_union_pw_aff_zero_union_set(copy()); - return manage(res); + char *str = isl_multi_val_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::multi_val -multi_val manage(__isl_take isl_multi_val *ptr) { - return multi_val(ptr); +// implementations for isl::point +point manage(__isl_take isl_point *ptr) { + return point(ptr); } -multi_val manage_copy(__isl_keep isl_multi_val *ptr) { - ptr = isl_multi_val_copy(ptr); - return multi_val(ptr); +point manage_copy(__isl_keep isl_point *ptr) { + ptr = isl_point_copy(ptr); + return point(ptr); } -multi_val::multi_val() +point::point() : ptr(nullptr) {} -multi_val::multi_val(const multi_val &obj) +point::point(const point &obj) : ptr(nullptr) { ptr = obj.copy(); } - -multi_val::multi_val(__isl_take isl_multi_val *ptr) +point::point(__isl_take isl_point *ptr) : ptr(ptr) {} -multi_val::multi_val(isl::space space, isl::val_list list) -{ - auto res = isl_multi_val_from_val_list(space.release(), list.release()); - ptr 
= res; -} -multi_val::multi_val(isl::ctx ctx, const std::string &str) +point::point(isl::space space) { - auto res = isl_multi_val_read_from_str(ctx.release(), str.c_str()); + auto res = isl_point_zero(space.release()); ptr = res; } -multi_val &multi_val::operator=(multi_val obj) { +point &point::operator=(point obj) { std::swap(this->ptr, obj.ptr); return *this; } -multi_val::~multi_val() { +point::~point() { if (ptr) - isl_multi_val_free(ptr); + isl_point_free(ptr); } -__isl_give isl_multi_val *multi_val::copy() const & { - return isl_multi_val_copy(ptr); +__isl_give isl_point *point::copy() const & { + return isl_point_copy(ptr); } -__isl_keep isl_multi_val *multi_val::get() const { +__isl_keep isl_point *point::get() const { return ptr; } -__isl_give isl_multi_val *multi_val::release() { - isl_multi_val *tmp = ptr; +__isl_give isl_point *point::release() { + isl_point *tmp = ptr; ptr = nullptr; return tmp; } -bool multi_val::is_null() const { +bool point::is_null() const { return ptr == nullptr; } +isl::ctx point::ctx() const { + return isl::ctx(isl_point_get_ctx(ptr)); +} -isl::ctx multi_val::ctx() const { - return isl::ctx(isl_multi_val_get_ctx(ptr)); +isl::set point::add_constraint(const isl::constraint &constraint) const +{ + return isl::basic_set(*this).add_constraint(constraint); } -void multi_val::dump() const { - isl_multi_val_dump(get()); +isl::set point::add_dims(isl::dim type, unsigned int n) const +{ + return isl::basic_set(*this).add_dims(type, n); } +isl::basic_set point::affine_hull() const +{ + return isl::basic_set(*this).affine_hull(); +} -isl::multi_val multi_val::add(isl::multi_val multi2) const +isl::set point::align_params(const isl::space &model) const { - auto res = isl_multi_val_add(copy(), multi2.release()); - return manage(res); + return isl::basic_set(*this).align_params(model); } -isl::multi_val multi_val::add(isl::val v) const +isl::basic_set point::apply(const isl::basic_map &bmap) const { - auto res = 
isl_multi_val_add_val(copy(), v.release()); - return manage(res); + return isl::basic_set(*this).apply(bmap); } -isl::multi_val multi_val::add_dims(isl::dim type, unsigned int n) const +isl::set point::apply(const isl::map &map) const { - auto res = isl_multi_val_add_dims(copy(), static_cast(type), n); - return manage(res); + return isl::basic_set(*this).apply(map); } -isl::multi_val multi_val::align_params(isl::space model) const +isl::union_set point::apply(const isl::union_map &umap) const { - auto res = isl_multi_val_align_params(copy(), model.release()); - return manage(res); + return isl::basic_set(*this).apply(umap); } -isl_size multi_val::dim(isl::dim type) const +isl::pw_multi_aff point::as_pw_multi_aff() const { - auto res = isl_multi_val_dim(get(), static_cast(type)); - return res; + return isl::basic_set(*this).as_pw_multi_aff(); } -isl::multi_val multi_val::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::set point::as_set() const { - auto res = isl_multi_val_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::basic_set(*this).as_set(); } -isl::multi_val multi_val::factor_range() const +isl::basic_set_list point::basic_set_list() const { - auto res = isl_multi_val_factor_range(copy()); - return manage(res); + return isl::basic_set(*this).basic_set_list(); } -int multi_val::find_dim_by_id(isl::dim type, const isl::id &id) const +isl::set point::bind(const isl::multi_id &tuple) const { - auto res = isl_multi_val_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + return isl::basic_set(*this).bind(tuple); } -int multi_val::find_dim_by_name(isl::dim type, const std::string &name) const +isl::set point::coalesce() const { - auto res = isl_multi_val_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return isl::basic_set(*this).coalesce(); } -isl::multi_val multi_val::flat_range_product(isl::multi_val multi2) const +isl::set point::complement() const { - auto res = 
isl_multi_val_flat_range_product(copy(), multi2.release()); - return manage(res); + return isl::basic_set(*this).complement(); } -isl::multi_val multi_val::flatten_range() const +isl::union_set point::compute_divs() const { - auto res = isl_multi_val_flatten_range(copy()); - return manage(res); + return isl::basic_set(*this).compute_divs(); } -isl::multi_val multi_val::from_range() const +boolean point::contains(const isl::space &space) const { - auto res = isl_multi_val_from_range(copy()); - return manage(res); + return isl::basic_set(*this).contains(space); } -isl::val multi_val::get_at(int pos) const +isl::basic_set point::convex_hull() const { - auto res = isl_multi_val_get_at(get(), pos); - return manage(res); + return isl::basic_set(*this).convex_hull(); } -isl::id multi_val::get_dim_id(isl::dim type, unsigned int pos) const +isl::val point::coordinate_val(isl::dim type, int pos) const { - auto res = isl_multi_val_get_dim_id(get(), static_cast(type), pos); + auto res = isl_point_get_coordinate_val(get(), static_cast(type), pos); return manage(res); } -isl::space multi_val::get_domain_space() const +isl::val point::get_coordinate_val(isl::dim type, int pos) const { - auto res = isl_multi_val_get_domain_space(get()); - return manage(res); + return coordinate_val(type, pos); } -isl::val_list multi_val::get_list() const +isl::basic_set point::detect_equalities() const { - auto res = isl_multi_val_get_list(get()); - return manage(res); + return isl::basic_set(*this).detect_equalities(); } -isl::space multi_val::get_space() const +class size point::dim(isl::dim type) const { - auto res = isl_multi_val_get_space(get()); - return manage(res); + return isl::basic_set(*this).dim(type); } -isl::id multi_val::get_tuple_id(isl::dim type) const +boolean point::dim_has_any_lower_bound(isl::dim type, unsigned int pos) const { - auto res = isl_multi_val_get_tuple_id(get(), static_cast(type)); - return manage(res); + return isl::basic_set(*this).dim_has_any_lower_bound(type, 
pos); } -std::string multi_val::get_tuple_name(isl::dim type) const +isl::id point::dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_multi_val_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + return isl::basic_set(*this).dim_id(type, pos); } -isl::val multi_val::get_val(int pos) const +isl::pw_aff point::dim_max(int pos) const { - auto res = isl_multi_val_get_val(get(), pos); - return manage(res); + return isl::basic_set(*this).dim_max(pos); } -boolean multi_val::has_tuple_id(isl::dim type) const +isl::val point::dim_max_val(int pos) const { - auto res = isl_multi_val_has_tuple_id(get(), static_cast(type)); - return manage(res); + return isl::basic_set(*this).dim_max_val(pos); } -isl::multi_val multi_val::insert_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::pw_aff point::dim_min(int pos) const { - auto res = isl_multi_val_insert_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::basic_set(*this).dim_min(pos); } -boolean multi_val::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::val point::dim_min_val(int pos) const { - auto res = isl_multi_val_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return isl::basic_set(*this).dim_min_val(pos); } -boolean multi_val::involves_nan() const +std::string point::dim_name(isl::dim type, unsigned int pos) const { - auto res = isl_multi_val_involves_nan(get()); - return manage(res); + return isl::basic_set(*this).dim_name(type, pos); } -boolean multi_val::is_zero() const +isl::aff point::div(int pos) const { - auto res = isl_multi_val_is_zero(get()); - return manage(res); + return isl::basic_set(*this).div(pos); } -isl::multi_val multi_val::max(isl::multi_val multi2) const +isl::set point::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_multi_val_max(copy(), multi2.release()); - return manage(res); + return 
isl::basic_set(*this).drop_constraints_involving_dims(type, first, n); } -isl::multi_val multi_val::min(isl::multi_val multi2) const +isl::set point::eliminate(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_multi_val_min(copy(), multi2.release()); - return manage(res); + return isl::basic_set(*this).eliminate(type, first, n); } -isl::multi_val multi_val::mod_multi_val(isl::multi_val mv) const +boolean point::every_set(const std::function &test) const { - auto res = isl_multi_val_mod_multi_val(copy(), mv.release()); - return manage(res); + return isl::basic_set(*this).every_set(test); } -isl::multi_val multi_val::mod_val(isl::val v) const +isl::set point::extract_set(const isl::space &space) const { - auto res = isl_multi_val_mod_val(copy(), v.release()); - return manage(res); + return isl::basic_set(*this).extract_set(space); } -isl::multi_val multi_val::neg() const +int point::find_dim_by_id(isl::dim type, const isl::id &id) const { - auto res = isl_multi_val_neg(copy()); - return manage(res); + return isl::basic_set(*this).find_dim_by_id(type, id); } -boolean multi_val::plain_is_equal(const isl::multi_val &multi2) const +int point::find_dim_by_id(isl::dim type, const std::string &id) const { - auto res = isl_multi_val_plain_is_equal(get(), multi2.get()); - return manage(res); + return this->find_dim_by_id(type, isl::id(ctx(), id)); } -isl::multi_val multi_val::product(isl::multi_val multi2) const +isl::basic_set point::fix_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_multi_val_product(copy(), multi2.release()); - return manage(res); + return isl::basic_set(*this).fix_si(type, pos, value); +} + +isl::basic_set point::fix_val(isl::dim type, unsigned int pos, const isl::val &v) const +{ + return isl::basic_set(*this).fix_val(type, pos, v); +} + +isl::basic_set point::fix_val(isl::dim type, unsigned int pos, long v) const +{ + return this->fix_val(type, pos, isl::val(ctx(), v)); +} + +isl::basic_set 
point::flatten() const +{ + return isl::basic_set(*this).flatten(); +} + +stat point::foreach_basic_set(const std::function &fn) const +{ + return isl::basic_set(*this).foreach_basic_set(fn); +} + +stat point::foreach_point(const std::function &fn) const +{ + return isl::basic_set(*this).foreach_point(fn); +} + +stat point::foreach_set(const std::function &fn) const +{ + return isl::basic_set(*this).foreach_set(fn); +} + +isl::basic_set point::gist(const isl::basic_set &context) const +{ + return isl::basic_set(*this).gist(context); +} + +isl::set point::gist(const isl::set &context) const +{ + return isl::basic_set(*this).gist(context); } -isl::multi_val multi_val::project_domain_on_params() const +isl::union_set point::gist(const isl::union_set &context) const { - auto res = isl_multi_val_project_domain_on_params(copy()); - return manage(res); + return isl::basic_set(*this).gist(context); } -isl::multi_val multi_val::range_factor_domain() const +isl::set point::gist_params(const isl::set &context) const { - auto res = isl_multi_val_range_factor_domain(copy()); - return manage(res); + return isl::basic_set(*this).gist_params(context); } -isl::multi_val multi_val::range_factor_range() const +boolean point::has_equal_space(const isl::set &set2) const { - auto res = isl_multi_val_range_factor_range(copy()); - return manage(res); + return isl::basic_set(*this).has_equal_space(set2); } -boolean multi_val::range_is_wrapping() const +isl::map point::identity() const { - auto res = isl_multi_val_range_is_wrapping(get()); - return manage(res); + return isl::basic_set(*this).identity(); } -isl::multi_val multi_val::range_product(isl::multi_val multi2) const +isl::union_pw_multi_aff point::identity_union_pw_multi_aff() const { - auto res = isl_multi_val_range_product(copy(), multi2.release()); - return manage(res); + return isl::basic_set(*this).identity_union_pw_multi_aff(); } -isl::multi_val multi_val::range_splice(unsigned int pos, isl::multi_val multi2) const 
+isl::pw_aff point::indicator_function() const { - auto res = isl_multi_val_range_splice(copy(), pos, multi2.release()); - return manage(res); + return isl::basic_set(*this).indicator_function(); } -isl::multi_val multi_val::reset_tuple_id(isl::dim type) const +isl::set point::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const { - auto res = isl_multi_val_reset_tuple_id(copy(), static_cast(type)); - return manage(res); + return isl::basic_set(*this).insert_dims(type, pos, n); } -isl::multi_val multi_val::reset_user() const +isl::map point::insert_domain(const isl::space &domain) const { - auto res = isl_multi_val_reset_user(copy()); - return manage(res); + return isl::basic_set(*this).insert_domain(domain); } -isl::multi_val multi_val::scale(isl::multi_val mv) const +isl::basic_set point::intersect(const isl::basic_set &bset2) const { - auto res = isl_multi_val_scale_multi_val(copy(), mv.release()); - return manage(res); + return isl::basic_set(*this).intersect(bset2); } -isl::multi_val multi_val::scale(isl::val v) const +isl::set point::intersect(const isl::set &set2) const { - auto res = isl_multi_val_scale_val(copy(), v.release()); - return manage(res); + return isl::basic_set(*this).intersect(set2); } -isl::multi_val multi_val::scale_down(isl::multi_val mv) const +isl::union_set point::intersect(const isl::union_set &uset2) const { - auto res = isl_multi_val_scale_down_multi_val(copy(), mv.release()); - return manage(res); + return isl::basic_set(*this).intersect(uset2); } -isl::multi_val multi_val::scale_down(isl::val v) const +isl::basic_set point::intersect_params(const isl::basic_set &bset2) const { - auto res = isl_multi_val_scale_down_val(copy(), v.release()); - return manage(res); + return isl::basic_set(*this).intersect_params(bset2); } -isl::multi_val multi_val::set_at(int pos, isl::val el) const +isl::set point::intersect_params(const isl::set ¶ms) const { - auto res = isl_multi_val_set_at(copy(), pos, el.release()); - return 
manage(res); + return isl::basic_set(*this).intersect_params(params); } -isl::multi_val multi_val::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +boolean point::involves_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_multi_val_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return isl::basic_set(*this).involves_dims(type, first, n); } -isl::multi_val multi_val::set_tuple_id(isl::dim type, isl::id id) const +boolean point::involves_locals() const { - auto res = isl_multi_val_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); + return isl::basic_set(*this).involves_locals(); } -isl::multi_val multi_val::set_tuple_name(isl::dim type, const std::string &s) const +boolean point::is_bounded() const { - auto res = isl_multi_val_set_tuple_name(copy(), static_cast(type), s.c_str()); - return manage(res); + return isl::basic_set(*this).is_bounded(); } -isl::multi_val multi_val::set_val(int pos, isl::val el) const +boolean point::is_disjoint(const isl::set &set2) const { - auto res = isl_multi_val_set_val(copy(), pos, el.release()); - return manage(res); + return isl::basic_set(*this).is_disjoint(set2); } -isl_size multi_val::size() const +boolean point::is_disjoint(const isl::union_set &uset2) const { - auto res = isl_multi_val_size(get()); - return res; + return isl::basic_set(*this).is_disjoint(uset2); } -isl::multi_val multi_val::splice(unsigned int in_pos, unsigned int out_pos, isl::multi_val multi2) const +boolean point::is_empty() const { - auto res = isl_multi_val_splice(copy(), in_pos, out_pos, multi2.release()); - return manage(res); + return isl::basic_set(*this).is_empty(); } -isl::multi_val multi_val::sub(isl::multi_val multi2) const +boolean point::is_equal(const isl::basic_set &bset2) const { - auto res = isl_multi_val_sub(copy(), multi2.release()); - return manage(res); + return isl::basic_set(*this).is_equal(bset2); } -isl::multi_val 
multi_val::zero(isl::space space) +boolean point::is_equal(const isl::set &set2) const { - auto res = isl_multi_val_zero(space.release()); - return manage(res); + return isl::basic_set(*this).is_equal(set2); } -// implementations for isl::point -point manage(__isl_take isl_point *ptr) { - return point(ptr); -} -point manage_copy(__isl_keep isl_point *ptr) { - ptr = isl_point_copy(ptr); - return point(ptr); +boolean point::is_equal(const isl::union_set &uset2) const +{ + return isl::basic_set(*this).is_equal(uset2); } -point::point() - : ptr(nullptr) {} - -point::point(const point &obj) - : ptr(nullptr) +boolean point::is_params() const { - ptr = obj.copy(); + return isl::basic_set(*this).is_params(); } - -point::point(__isl_take isl_point *ptr) - : ptr(ptr) {} - -point::point(isl::space dim) +boolean point::is_singleton() const { - auto res = isl_point_zero(dim.release()); - ptr = res; + return isl::basic_set(*this).is_singleton(); } -point &point::operator=(point obj) { - std::swap(this->ptr, obj.ptr); - return *this; +boolean point::is_strict_subset(const isl::set &set2) const +{ + return isl::basic_set(*this).is_strict_subset(set2); } -point::~point() { - if (ptr) - isl_point_free(ptr); +boolean point::is_strict_subset(const isl::union_set &uset2) const +{ + return isl::basic_set(*this).is_strict_subset(uset2); } -__isl_give isl_point *point::copy() const & { - return isl_point_copy(ptr); +boolean point::is_subset(const isl::basic_set &bset2) const +{ + return isl::basic_set(*this).is_subset(bset2); } -__isl_keep isl_point *point::get() const { - return ptr; +boolean point::is_subset(const isl::set &set2) const +{ + return isl::basic_set(*this).is_subset(set2); } -__isl_give isl_point *point::release() { - isl_point *tmp = ptr; - ptr = nullptr; - return tmp; +boolean point::is_subset(const isl::union_set &uset2) const +{ + return isl::basic_set(*this).is_subset(uset2); } -bool point::is_null() const { - return ptr == nullptr; +boolean point::is_wrapping() const 
+{ + return isl::basic_set(*this).is_wrapping(); } - -isl::ctx point::ctx() const { - return isl::ctx(isl_point_get_ctx(ptr)); +boolean point::isa_set() const +{ + return isl::basic_set(*this).isa_set(); } -void point::dump() const { - isl_point_dump(get()); +isl::set point::lexmax() const +{ + return isl::basic_set(*this).lexmax(); } - -isl::point point::add_ui(isl::dim type, int pos, unsigned int val) const +isl::pw_multi_aff point::lexmax_pw_multi_aff() const { - auto res = isl_point_add_ui(copy(), static_cast(type), pos, val); - return manage(res); + return isl::basic_set(*this).lexmax_pw_multi_aff(); } -isl::val point::get_coordinate_val(isl::dim type, int pos) const +isl::set point::lexmin() const { - auto res = isl_point_get_coordinate_val(get(), static_cast(type), pos); - return manage(res); + return isl::basic_set(*this).lexmin(); } -isl::multi_val point::get_multi_val() const +isl::pw_multi_aff point::lexmin_pw_multi_aff() const { - auto res = isl_point_get_multi_val(get()); - return manage(res); + return isl::basic_set(*this).lexmin_pw_multi_aff(); } -isl::space point::get_space() const +isl::set point::lower_bound(const isl::multi_pw_aff &lower) const { - auto res = isl_point_get_space(get()); - return manage(res); + return isl::basic_set(*this).lower_bound(lower); } -isl::point point::set_coordinate_val(isl::dim type, int pos, isl::val v) const +isl::set point::lower_bound(const isl::multi_val &lower) const { - auto res = isl_point_set_coordinate_val(copy(), static_cast(type), pos, v.release()); - return manage(res); + return isl::basic_set(*this).lower_bound(lower); } -isl::point point::sub_ui(isl::dim type, int pos, unsigned int val) const +isl::set point::lower_bound_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_point_sub_ui(copy(), static_cast(type), pos, val); - return manage(res); + return isl::basic_set(*this).lower_bound_si(type, pos, value); } -// implementations for isl::pw_aff -pw_aff manage(__isl_take isl_pw_aff 
*ptr) { - return pw_aff(ptr); +isl::set point::lower_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const +{ + return isl::basic_set(*this).lower_bound_val(type, pos, value); } -pw_aff manage_copy(__isl_keep isl_pw_aff *ptr) { - ptr = isl_pw_aff_copy(ptr); - return pw_aff(ptr); + +isl::set point::lower_bound_val(isl::dim type, unsigned int pos, long value) const +{ + return this->lower_bound_val(type, pos, isl::val(ctx(), value)); } -pw_aff::pw_aff() - : ptr(nullptr) {} +isl::multi_pw_aff point::max_multi_pw_aff() const +{ + return isl::basic_set(*this).max_multi_pw_aff(); +} -pw_aff::pw_aff(const pw_aff &obj) - : ptr(nullptr) +isl::val point::max_val(const isl::aff &obj) const { - ptr = obj.copy(); + return isl::basic_set(*this).max_val(obj); } +isl::multi_pw_aff point::min_multi_pw_aff() const +{ + return isl::basic_set(*this).min_multi_pw_aff(); +} -pw_aff::pw_aff(__isl_take isl_pw_aff *ptr) - : ptr(ptr) {} +isl::val point::min_val(const isl::aff &obj) const +{ + return isl::basic_set(*this).min_val(obj); +} -pw_aff::pw_aff(isl::aff aff) +isl::multi_val point::multi_val() const { - auto res = isl_pw_aff_from_aff(aff.release()); - ptr = res; + auto res = isl_point_get_multi_val(get()); + return manage(res); } -pw_aff::pw_aff(isl::ctx ctx, const std::string &str) + +isl::multi_val point::get_multi_val() const { - auto res = isl_pw_aff_read_from_str(ctx.release(), str.c_str()); - ptr = res; + return multi_val(); } -pw_aff::pw_aff(isl::set domain, isl::val v) + +class size point::n_basic_set() const { - auto res = isl_pw_aff_val_on_domain(domain.release(), v.release()); - ptr = res; + return isl::basic_set(*this).n_basic_set(); } -pw_aff::pw_aff(isl::local_space ls) + +isl::basic_set point::params() const { - auto res = isl_pw_aff_zero_on_domain(ls.release()); - ptr = res; + return isl::basic_set(*this).params(); } -pw_aff &pw_aff::operator=(pw_aff obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::val 
point::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const +{ + return isl::basic_set(*this).plain_get_val_if_fixed(type, pos); } -pw_aff::~pw_aff() { - if (ptr) - isl_pw_aff_free(ptr); +isl::multi_val point::plain_multi_val_if_fixed() const +{ + return isl::basic_set(*this).plain_multi_val_if_fixed(); } -__isl_give isl_pw_aff *pw_aff::copy() const & { - return isl_pw_aff_copy(ptr); +isl::basic_set point::polyhedral_hull() const +{ + return isl::basic_set(*this).polyhedral_hull(); } -__isl_keep isl_pw_aff *pw_aff::get() const { - return ptr; +isl::set point::preimage(const isl::multi_aff &ma) const +{ + return isl::basic_set(*this).preimage(ma); } -__isl_give isl_pw_aff *pw_aff::release() { - isl_pw_aff *tmp = ptr; - ptr = nullptr; - return tmp; +isl::set point::preimage(const isl::multi_pw_aff &mpa) const +{ + return isl::basic_set(*this).preimage(mpa); } -bool pw_aff::is_null() const { - return ptr == nullptr; +isl::set point::preimage(const isl::pw_multi_aff &pma) const +{ + return isl::basic_set(*this).preimage(pma); } +isl::union_set point::preimage(const isl::union_pw_multi_aff &upma) const +{ + return isl::basic_set(*this).preimage(upma); +} -isl::ctx pw_aff::ctx() const { - return isl::ctx(isl_pw_aff_get_ctx(ptr)); +isl::set point::product(const isl::set &set2) const +{ + return isl::basic_set(*this).product(set2); } -void pw_aff::dump() const { - isl_pw_aff_dump(get()); +isl::basic_set point::project_out(isl::dim type, unsigned int first, unsigned int n) const +{ + return isl::basic_set(*this).project_out(type, first, n); } +isl::set point::project_out_all_params() const +{ + return isl::basic_set(*this).project_out_all_params(); +} -isl::pw_aff pw_aff::add(isl::pw_aff pwaff2) const +isl::set point::project_out_param(const isl::id &id) const { - auto res = isl_pw_aff_add(copy(), pwaff2.release()); - return manage(res); + return isl::basic_set(*this).project_out_param(id); } -isl::pw_aff pw_aff::add_constant(isl::val v) const +isl::set 
point::project_out_param(const std::string &id) const { - auto res = isl_pw_aff_add_constant_val(copy(), v.release()); - return manage(res); + return this->project_out_param(isl::id(ctx(), id)); } -isl::pw_aff pw_aff::add_dims(isl::dim type, unsigned int n) const +isl::set point::project_out_param(const isl::id_list &list) const { - auto res = isl_pw_aff_add_dims(copy(), static_cast(type), n); - return manage(res); + return isl::basic_set(*this).project_out_param(list); } -isl::pw_aff pw_aff::align_params(isl::space model) const +isl::pw_multi_aff point::pw_multi_aff_on_domain(const isl::multi_val &mv) const { - auto res = isl_pw_aff_align_params(copy(), model.release()); - return manage(res); + return isl::basic_set(*this).pw_multi_aff_on_domain(mv); } -isl::pw_aff pw_aff::alloc(isl::set set, isl::aff aff) +isl::set point::remove_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_pw_aff_alloc(set.release(), aff.release()); - return manage(res); + return isl::basic_set(*this).remove_dims(type, first, n); } -isl::aff pw_aff::as_aff() const +isl::set point::remove_divs() const { - auto res = isl_pw_aff_as_aff(copy()); - return manage(res); + return isl::basic_set(*this).remove_divs(); } -isl::set pw_aff::bind(isl::id id) const +isl::set point::remove_redundancies() const { - auto res = isl_pw_aff_bind_id(copy(), id.release()); - return manage(res); + return isl::basic_set(*this).remove_redundancies(); } -isl::pw_aff pw_aff::bind_domain(isl::multi_id tuple) const +isl::set point::reset_tuple_id() const { - auto res = isl_pw_aff_bind_domain(copy(), tuple.release()); - return manage(res); + return isl::basic_set(*this).reset_tuple_id(); } -isl::pw_aff pw_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const +isl::basic_set point::sample() const { - auto res = isl_pw_aff_bind_domain_wrapped_domain(copy(), tuple.release()); - return manage(res); + return isl::basic_set(*this).sample(); } -isl::pw_aff pw_aff::ceil() const +isl::point 
point::sample_point() const { - auto res = isl_pw_aff_ceil(copy()); - return manage(res); + return isl::basic_set(*this).sample_point(); } -isl::pw_aff pw_aff::coalesce() const +isl::set point::set_dim_id(isl::dim type, unsigned int pos, const isl::id &id) const { - auto res = isl_pw_aff_coalesce(copy()); - return manage(res); + return isl::basic_set(*this).set_dim_id(type, pos, id); } -isl::pw_aff pw_aff::cond(isl::pw_aff pwaff_true, isl::pw_aff pwaff_false) const +isl::set point::set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const { - auto res = isl_pw_aff_cond(copy(), pwaff_true.release(), pwaff_false.release()); - return manage(res); + return this->set_dim_id(type, pos, isl::id(ctx(), id)); } -isl_size pw_aff::dim(isl::dim type) const +isl::set_list point::set_list() const { - auto res = isl_pw_aff_dim(get(), static_cast(type)); - return res; + return isl::basic_set(*this).set_list(); } -isl::pw_aff pw_aff::div(isl::pw_aff pa2) const +isl::set point::set_tuple_id(const isl::id &id) const { - auto res = isl_pw_aff_div(copy(), pa2.release()); - return manage(res); + return isl::basic_set(*this).set_tuple_id(id); } -isl::set pw_aff::domain() const +isl::set point::set_tuple_id(const std::string &id) const { - auto res = isl_pw_aff_domain(copy()); - return manage(res); + return this->set_tuple_id(isl::id(ctx(), id)); } -isl::pw_aff pw_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::fixed_box point::simple_fixed_box_hull() const { - auto res = isl_pw_aff_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::basic_set(*this).simple_fixed_box_hull(); } -isl::pw_aff pw_aff::drop_unused_params() const +isl::basic_set point::simple_hull() const { - auto res = isl_pw_aff_drop_unused_params(copy()); - return manage(res); + return isl::basic_set(*this).simple_hull(); } -isl::pw_aff pw_aff::empty(isl::space space) +isl::space point::space() const { - auto res = isl_pw_aff_empty(space.release()); 
- return manage(res); + return isl::basic_set(*this).space(); } -isl::map pw_aff::eq_map(isl::pw_aff pa2) const +isl::val point::stride(int pos) const { - auto res = isl_pw_aff_eq_map(copy(), pa2.release()); - return manage(res); + return isl::basic_set(*this).stride(pos); } -isl::set pw_aff::eq_set(isl::pw_aff pwaff2) const +isl::set point::subtract(const isl::set &set2) const { - auto res = isl_pw_aff_eq_set(copy(), pwaff2.release()); - return manage(res); + return isl::basic_set(*this).subtract(set2); } -isl::val pw_aff::eval(isl::point pnt) const +isl::union_set point::subtract(const isl::union_set &uset2) const { - auto res = isl_pw_aff_eval(copy(), pnt.release()); - return manage(res); + return isl::basic_set(*this).subtract(uset2); } -int pw_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::basic_set_list point::to_list() const { - auto res = isl_pw_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return isl::basic_set(*this).to_list(); } -isl::pw_aff pw_aff::floor() const +isl::set point::to_set() const { - auto res = isl_pw_aff_floor(copy()); + auto res = isl_point_to_set(copy()); return manage(res); } -stat pw_aff::foreach_piece(const std::function &fn) const +isl::union_set point::to_union_set() const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_set *arg_0, isl_aff *arg_1, void *arg_2) -> isl_stat { - auto *data = static_cast(arg_2); - stat ret = (*data->func)(manage(arg_0), manage(arg_1)); - return ret.release(); - }; - auto res = isl_pw_aff_foreach_piece(get(), fn_lambda, &fn_data); - return manage(res); + return isl::basic_set(*this).to_union_set(); } -isl::pw_aff pw_aff::from_range() const +isl::map point::translation() const { - auto res = isl_pw_aff_from_range(copy()); - return manage(res); + return isl::basic_set(*this).translation(); } -isl::map pw_aff::ge_map(isl::pw_aff pa2) const +class size point::tuple_dim() const { - auto res = 
isl_pw_aff_ge_map(copy(), pa2.release()); - return manage(res); + return isl::basic_set(*this).tuple_dim(); } -isl::set pw_aff::ge_set(isl::pw_aff pwaff2) const +isl::id point::tuple_id() const { - auto res = isl_pw_aff_ge_set(copy(), pwaff2.release()); - return manage(res); + return isl::basic_set(*this).tuple_id(); } -isl::id pw_aff::get_dim_id(isl::dim type, unsigned int pos) const +std::string point::tuple_name() const { - auto res = isl_pw_aff_get_dim_id(get(), static_cast(type), pos); - return manage(res); + return isl::basic_set(*this).tuple_name(); } -std::string pw_aff::get_dim_name(isl::dim type, unsigned int pos) const +isl::set point::unbind_params(const isl::multi_id &tuple) const { - auto res = isl_pw_aff_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + return isl::basic_set(*this).unbind_params(tuple); } -isl::space pw_aff::get_domain_space() const +isl::map point::unbind_params_insert_domain(const isl::multi_id &domain) const { - auto res = isl_pw_aff_get_domain_space(get()); - return manage(res); + return isl::basic_set(*this).unbind_params_insert_domain(domain); } -uint32_t pw_aff::get_hash() const +isl::set point::unite(const isl::basic_set &bset2) const { - auto res = isl_pw_aff_get_hash(get()); - return res; + return isl::basic_set(*this).unite(bset2); } -isl::space pw_aff::get_space() const +isl::set point::unite(const isl::set &set2) const { - auto res = isl_pw_aff_get_space(get()); - return manage(res); + return isl::basic_set(*this).unite(set2); } -isl::id pw_aff::get_tuple_id(isl::dim type) const +isl::union_set point::unite(const isl::union_set &uset2) const { - auto res = isl_pw_aff_get_tuple_id(get(), static_cast(type)); - return manage(res); + return isl::basic_set(*this).unite(uset2); } -isl::pw_aff pw_aff::gist(isl::set context) const +isl::basic_set point::unshifted_simple_hull() const { - auto res = isl_pw_aff_gist(copy(), context.release()); - return manage(res); + return 
isl::basic_set(*this).unshifted_simple_hull(); } -isl::pw_aff pw_aff::gist_params(isl::set context) const +isl::map point::unwrap() const { - auto res = isl_pw_aff_gist_params(copy(), context.release()); - return manage(res); + return isl::basic_set(*this).unwrap(); } -isl::map pw_aff::gt_map(isl::pw_aff pa2) const +isl::set point::upper_bound(const isl::multi_pw_aff &upper) const { - auto res = isl_pw_aff_gt_map(copy(), pa2.release()); - return manage(res); + return isl::basic_set(*this).upper_bound(upper); } -isl::set pw_aff::gt_set(isl::pw_aff pwaff2) const +isl::set point::upper_bound(const isl::multi_val &upper) const { - auto res = isl_pw_aff_gt_set(copy(), pwaff2.release()); - return manage(res); + return isl::basic_set(*this).upper_bound(upper); } -boolean pw_aff::has_dim_id(isl::dim type, unsigned int pos) const +isl::set point::upper_bound_val(isl::dim type, unsigned int pos, const isl::val &value) const { - auto res = isl_pw_aff_has_dim_id(get(), static_cast(type), pos); - return manage(res); + return isl::basic_set(*this).upper_bound_val(type, pos, value); } -boolean pw_aff::has_tuple_id(isl::dim type) const +isl::set point::upper_bound_val(isl::dim type, unsigned int pos, long value) const { - auto res = isl_pw_aff_has_tuple_id(get(), static_cast(type)); - return manage(res); + return this->upper_bound_val(type, pos, isl::val(ctx(), value)); } -isl::pw_aff pw_aff::insert_dims(isl::dim type, unsigned int first, unsigned int n) const +inline std::ostream &operator<<(std::ostream &os, const point &obj) { - auto res = isl_pw_aff_insert_dims(copy(), static_cast(type), first, n); - return manage(res); + char *str = isl_point_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::pw_aff pw_aff::insert_domain(isl::space domain) const -{ - auto res = isl_pw_aff_insert_domain(copy(), domain.release()); - return manage(res); +// implementations for isl::pw_aff +pw_aff 
manage(__isl_take isl_pw_aff *ptr) { + return pw_aff(ptr); +} +pw_aff manage_copy(__isl_keep isl_pw_aff *ptr) { + ptr = isl_pw_aff_copy(ptr); + return pw_aff(ptr); } -isl::pw_aff pw_aff::intersect_domain(isl::set set) const +pw_aff::pw_aff() + : ptr(nullptr) {} + +pw_aff::pw_aff(const pw_aff &obj) + : ptr(nullptr) { - auto res = isl_pw_aff_intersect_domain(copy(), set.release()); - return manage(res); + ptr = obj.copy(); } -isl::pw_aff pw_aff::intersect_domain_wrapped_domain(isl::set set) const +pw_aff::pw_aff(__isl_take isl_pw_aff *ptr) + : ptr(ptr) {} + +pw_aff::pw_aff(isl::aff aff) { - auto res = isl_pw_aff_intersect_domain_wrapped_domain(copy(), set.release()); - return manage(res); + auto res = isl_pw_aff_from_aff(aff.release()); + ptr = res; } -isl::pw_aff pw_aff::intersect_domain_wrapped_range(isl::set set) const +pw_aff::pw_aff(isl::ctx ctx, const std::string &str) { - auto res = isl_pw_aff_intersect_domain_wrapped_range(copy(), set.release()); - return manage(res); + auto res = isl_pw_aff_read_from_str(ctx.release(), str.c_str()); + ptr = res; } -isl::pw_aff pw_aff::intersect_params(isl::set set) const +pw_aff::pw_aff(isl::set domain, isl::val v) { - auto res = isl_pw_aff_intersect_params(copy(), set.release()); - return manage(res); + auto res = isl_pw_aff_val_on_domain(domain.release(), v.release()); + ptr = res; } -boolean pw_aff::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +pw_aff::pw_aff(isl::local_space ls) { - auto res = isl_pw_aff_involves_dims(get(), static_cast(type), first, n); - return manage(res); + auto res = isl_pw_aff_zero_on_domain(ls.release()); + ptr = res; } -boolean pw_aff::involves_nan() const -{ - auto res = isl_pw_aff_involves_nan(get()); - return manage(res); +pw_aff &pw_aff::operator=(pw_aff obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -boolean pw_aff::involves_param_id(const isl::id &id) const -{ - auto res = isl_pw_aff_involves_param_id(get(), id.get()); - return manage(res); 
+pw_aff::~pw_aff() { + if (ptr) + isl_pw_aff_free(ptr); } -boolean pw_aff::is_cst() const -{ - auto res = isl_pw_aff_is_cst(get()); - return manage(res); +__isl_give isl_pw_aff *pw_aff::copy() const & { + return isl_pw_aff_copy(ptr); } -boolean pw_aff::is_empty() const -{ - auto res = isl_pw_aff_is_empty(get()); - return manage(res); +__isl_keep isl_pw_aff *pw_aff::get() const { + return ptr; } -boolean pw_aff::is_equal(const isl::pw_aff &pa2) const -{ - auto res = isl_pw_aff_is_equal(get(), pa2.get()); - return manage(res); +__isl_give isl_pw_aff *pw_aff::release() { + isl_pw_aff *tmp = ptr; + ptr = nullptr; + return tmp; } -boolean pw_aff::isa_aff() const -{ - auto res = isl_pw_aff_isa_aff(get()); - return manage(res); +bool pw_aff::is_null() const { + return ptr == nullptr; } -isl::map pw_aff::le_map(isl::pw_aff pa2) const -{ - auto res = isl_pw_aff_le_map(copy(), pa2.release()); - return manage(res); +isl::ctx pw_aff::ctx() const { + return isl::ctx(isl_pw_aff_get_ctx(ptr)); } -isl::set pw_aff::le_set(isl::pw_aff pwaff2) const +isl::multi_pw_aff pw_aff::add(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_aff_le_set(copy(), pwaff2.release()); - return manage(res); + return isl::pw_multi_aff(*this).add(multi2); } -isl::map pw_aff::lt_map(isl::pw_aff pa2) const +isl::multi_union_pw_aff pw_aff::add(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_pw_aff_lt_map(copy(), pa2.release()); - return manage(res); + return isl::union_pw_aff(*this).add(multi2); } -isl::set pw_aff::lt_set(isl::pw_aff pwaff2) const +isl::pw_aff pw_aff::add(isl::pw_aff pwaff2) const { - auto res = isl_pw_aff_lt_set(copy(), pwaff2.release()); + auto res = isl_pw_aff_add(copy(), pwaff2.release()); return manage(res); } -isl::pw_aff pw_aff::max(isl::pw_aff pwaff2) const +isl::pw_multi_aff pw_aff::add(const isl::pw_multi_aff &pma2) const { - auto res = isl_pw_aff_max(copy(), pwaff2.release()); - return manage(res); + return isl::pw_multi_aff(*this).add(pma2); } 
-isl::pw_aff pw_aff::min(isl::pw_aff pwaff2) const +isl::union_pw_aff pw_aff::add(const isl::union_pw_aff &upa2) const { - auto res = isl_pw_aff_min(copy(), pwaff2.release()); - return manage(res); + return isl::union_pw_aff(*this).add(upa2); } -isl::pw_aff pw_aff::mod(isl::val mod) const +isl::union_pw_multi_aff pw_aff::add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_aff_mod_val(copy(), mod.release()); - return manage(res); + return isl::union_pw_aff(*this).add(upma2); } -isl::pw_aff pw_aff::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::pw_aff pw_aff::add(const isl::aff &pwaff2) const { - auto res = isl_pw_aff_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); + return this->add(isl::pw_aff(pwaff2)); } -isl::pw_aff pw_aff::mul(isl::pw_aff pwaff2) const +isl::pw_aff pw_aff::add_constant(isl::val v) const { - auto res = isl_pw_aff_mul(copy(), pwaff2.release()); + auto res = isl_pw_aff_add_constant_val(copy(), v.release()); return manage(res); } -isl_size pw_aff::n_piece() const +isl::pw_aff pw_aff::add_constant(long v) const { - auto res = isl_pw_aff_n_piece(get()); - return res; + return this->add_constant(isl::val(ctx(), v)); } -isl::pw_aff pw_aff::nan_on_domain(isl::local_space ls) +isl::pw_multi_aff pw_aff::add_constant(const isl::multi_val &mv) const { - auto res = isl_pw_aff_nan_on_domain(ls.release()); - return manage(res); + return isl::pw_multi_aff(*this).add_constant(mv); } -isl::pw_aff pw_aff::nan_on_domain_space(isl::space space) +isl::pw_aff pw_aff::add_dims(isl::dim type, unsigned int n) const { - auto res = isl_pw_aff_nan_on_domain_space(space.release()); + auto res = isl_pw_aff_add_dims(copy(), static_cast(type), n); return manage(res); } -isl::set pw_aff::ne_set(isl::pw_aff pwaff2) const +isl::union_pw_multi_aff pw_aff::add_pw_multi_aff(const isl::pw_multi_aff &pma) const { - auto res = 
isl_pw_aff_ne_set(copy(), pwaff2.release()); - return manage(res); + return isl::union_pw_aff(*this).add_pw_multi_aff(pma); } -isl::pw_aff pw_aff::neg() const +isl::union_pw_multi_aff pw_aff::apply(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_aff_neg(copy()); - return manage(res); + return isl::union_pw_aff(*this).apply(upma2); } -isl::set pw_aff::non_zero_set() const +isl::aff pw_aff::as_aff() const { - auto res = isl_pw_aff_non_zero_set(copy()); + auto res = isl_pw_aff_as_aff(copy()); return manage(res); } -isl::set pw_aff::nonneg_set() const +isl::map pw_aff::as_map() const { - auto res = isl_pw_aff_nonneg_set(copy()); + auto res = isl_pw_aff_as_map(copy()); return manage(res); } -isl::pw_aff pw_aff::param_on_domain(isl::set domain, isl::id id) +isl::multi_aff pw_aff::as_multi_aff() const { - auto res = isl_pw_aff_param_on_domain_id(domain.release(), id.release()); - return manage(res); + return isl::pw_multi_aff(*this).as_multi_aff(); } -isl::set pw_aff::params() const +isl::multi_union_pw_aff pw_aff::as_multi_union_pw_aff() const { - auto res = isl_pw_aff_params(copy()); - return manage(res); + return isl::union_pw_aff(*this).as_multi_union_pw_aff(); } -int pw_aff::plain_cmp(const isl::pw_aff &pa2) const +isl::pw_multi_aff pw_aff::as_pw_multi_aff() const { - auto res = isl_pw_aff_plain_cmp(get(), pa2.get()); - return res; + return isl::union_pw_aff(*this).as_pw_multi_aff(); } -boolean pw_aff::plain_is_equal(const isl::pw_aff &pwaff2) const +isl::set pw_aff::as_set() const { - auto res = isl_pw_aff_plain_is_equal(get(), pwaff2.get()); - return manage(res); + return isl::pw_multi_aff(*this).as_set(); } -isl::set pw_aff::pos_set() const +isl::union_map pw_aff::as_union_map() const { - auto res = isl_pw_aff_pos_set(copy()); - return manage(res); + return isl::union_pw_aff(*this).as_union_map(); } -isl::pw_aff pw_aff::project_domain_on_params() const +isl::pw_aff pw_aff::at(int pos) const { - auto res = 
isl_pw_aff_project_domain_on_params(copy()); - return manage(res); + return isl::pw_multi_aff(*this).at(pos); } -isl::pw_aff pw_aff::pullback(isl::multi_aff ma) const +isl::set pw_aff::bind(const isl::multi_id &tuple) const { - auto res = isl_pw_aff_pullback_multi_aff(copy(), ma.release()); - return manage(res); + return isl::multi_pw_aff(*this).bind(tuple); } -isl::pw_aff pw_aff::pullback(isl::multi_pw_aff mpa) const +isl::set pw_aff::bind(isl::id id) const { - auto res = isl_pw_aff_pullback_multi_pw_aff(copy(), mpa.release()); + auto res = isl_pw_aff_bind_id(copy(), id.release()); return manage(res); } -isl::pw_aff pw_aff::pullback(isl::pw_multi_aff pma) const +isl::set pw_aff::bind(const std::string &id) const { - auto res = isl_pw_aff_pullback_pw_multi_aff(copy(), pma.release()); - return manage(res); + return this->bind(isl::id(ctx(), id)); } -isl::pw_aff pw_aff::reset_tuple_id(isl::dim type) const +isl::pw_aff pw_aff::bind_domain(isl::multi_id tuple) const { - auto res = isl_pw_aff_reset_tuple_id(copy(), static_cast(type)); + auto res = isl_pw_aff_bind_domain(copy(), tuple.release()); return manage(res); } -isl::pw_aff pw_aff::reset_user() const +isl::pw_aff pw_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const { - auto res = isl_pw_aff_reset_user(copy()); + auto res = isl_pw_aff_bind_domain_wrapped_domain(copy(), tuple.release()); return manage(res); } -isl::pw_aff pw_aff::scale(isl::val v) const +isl::pw_aff pw_aff::ceil() const { - auto res = isl_pw_aff_scale_val(copy(), v.release()); + auto res = isl_pw_aff_ceil(copy()); return manage(res); } -isl::pw_aff pw_aff::scale_down(isl::val f) const +isl::pw_aff pw_aff::coalesce() const { - auto res = isl_pw_aff_scale_down_val(copy(), f.release()); + auto res = isl_pw_aff_coalesce(copy()); return manage(res); } -isl::pw_aff pw_aff::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::pw_aff pw_aff::cond(isl::pw_aff pwaff_true, isl::pw_aff pwaff_false) const { - auto res = 
isl_pw_aff_set_dim_id(copy(), static_cast(type), pos, id.release()); + auto res = isl_pw_aff_cond(copy(), pwaff_true.release(), pwaff_false.release()); return manage(res); } -isl::pw_aff pw_aff::set_tuple_id(isl::dim type, isl::id id) const +class size pw_aff::dim(isl::dim type) const { - auto res = isl_pw_aff_set_tuple_id(copy(), static_cast(type), id.release()); - return manage(res); + return isl::pw_multi_aff(*this).dim(type); } -isl::pw_aff pw_aff::sub(isl::pw_aff pwaff2) const +isl::id pw_aff::dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_pw_aff_sub(copy(), pwaff2.release()); + auto res = isl_pw_aff_get_dim_id(get(), static_cast(type), pos); return manage(res); } -isl::pw_aff pw_aff::subtract_domain(isl::set set) const +isl::id pw_aff::get_dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_pw_aff_subtract_domain(copy(), set.release()); - return manage(res); + return dim_id(type, pos); } -isl::pw_aff pw_aff::tdiv_q(isl::pw_aff pa2) const +isl::pw_aff pw_aff::div(isl::pw_aff pa2) const { - auto res = isl_pw_aff_tdiv_q(copy(), pa2.release()); + auto res = isl_pw_aff_div(copy(), pa2.release()); return manage(res); } -isl::pw_aff pw_aff::tdiv_r(isl::pw_aff pa2) const +isl::set pw_aff::domain() const { - auto res = isl_pw_aff_tdiv_r(copy(), pa2.release()); + auto res = isl_pw_aff_domain(copy()); return manage(res); } -isl::pw_aff pw_aff::union_add(isl::pw_aff pwaff2) const +isl::space pw_aff::domain_space() const { - auto res = isl_pw_aff_union_add(copy(), pwaff2.release()); + auto res = isl_pw_aff_get_domain_space(get()); return manage(res); } -isl::pw_aff pw_aff::union_max(isl::pw_aff pwaff2) const +isl::space pw_aff::get_domain_space() const { - auto res = isl_pw_aff_union_max(copy(), pwaff2.release()); - return manage(res); + return domain_space(); } -isl::pw_aff pw_aff::union_min(isl::pw_aff pwaff2) const +isl::pw_multi_aff pw_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = 
isl_pw_aff_union_min(copy(), pwaff2.release()); - return manage(res); + return isl::pw_multi_aff(*this).drop_dims(type, first, n); } -isl::pw_aff pw_aff::var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos) +isl::set pw_aff::eq_set(isl::pw_aff pwaff2) const { - auto res = isl_pw_aff_var_on_domain(ls.release(), static_cast(type), pos); + auto res = isl_pw_aff_eq_set(copy(), pwaff2.release()); return manage(res); } -isl::set pw_aff::zero_set() const +isl::val pw_aff::eval(isl::point pnt) const { - auto res = isl_pw_aff_zero_set(copy()); + auto res = isl_pw_aff_eval(copy(), pnt.release()); return manage(res); } -// implementations for isl::pw_aff_list -pw_aff_list manage(__isl_take isl_pw_aff_list *ptr) { - return pw_aff_list(ptr); -} -pw_aff_list manage_copy(__isl_keep isl_pw_aff_list *ptr) { - ptr = isl_pw_aff_list_copy(ptr); - return pw_aff_list(ptr); +isl::pw_multi_aff pw_aff::extract_pw_multi_aff(const isl::space &space) const +{ + return isl::union_pw_aff(*this).extract_pw_multi_aff(space); } -pw_aff_list::pw_aff_list() - : ptr(nullptr) {} - -pw_aff_list::pw_aff_list(const pw_aff_list &obj) - : ptr(nullptr) +isl::multi_pw_aff pw_aff::flat_range_product(const isl::multi_pw_aff &multi2) const { - ptr = obj.copy(); + return isl::pw_multi_aff(*this).flat_range_product(multi2); } - -pw_aff_list::pw_aff_list(__isl_take isl_pw_aff_list *ptr) - : ptr(ptr) {} - - -pw_aff_list &pw_aff_list::operator=(pw_aff_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::multi_union_pw_aff pw_aff::flat_range_product(const isl::multi_union_pw_aff &multi2) const +{ + return isl::union_pw_aff(*this).flat_range_product(multi2); } -pw_aff_list::~pw_aff_list() { - if (ptr) - isl_pw_aff_list_free(ptr); +isl::pw_multi_aff pw_aff::flat_range_product(const isl::pw_multi_aff &pma2) const +{ + return isl::pw_multi_aff(*this).flat_range_product(pma2); } -__isl_give isl_pw_aff_list *pw_aff_list::copy() const & { - return isl_pw_aff_list_copy(ptr); 
+isl::union_pw_multi_aff pw_aff::flat_range_product(const isl::union_pw_multi_aff &upma2) const +{ + return isl::union_pw_aff(*this).flat_range_product(upma2); } -__isl_keep isl_pw_aff_list *pw_aff_list::get() const { - return ptr; +isl::pw_aff pw_aff::floor() const +{ + auto res = isl_pw_aff_floor(copy()); + return manage(res); } -__isl_give isl_pw_aff_list *pw_aff_list::release() { - isl_pw_aff_list *tmp = ptr; - ptr = nullptr; - return tmp; +stat pw_aff::foreach_piece(const std::function &fn) const +{ + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_set *arg_0, isl_aff *arg_1, void *arg_2) -> isl_stat { + auto *data = static_cast(arg_2); + auto ret = (data->func)(manage(arg_0), manage(arg_1)); + return ret.release(); + }; + auto res = isl_pw_aff_foreach_piece(get(), fn_lambda, &fn_data); + return manage(res); } -bool pw_aff_list::is_null() const { - return ptr == nullptr; +stat pw_aff::foreach_piece(const std::function &fn) const +{ + return isl::pw_multi_aff(*this).foreach_piece(fn); } - -isl::ctx pw_aff_list::ctx() const { - return isl::ctx(isl_pw_aff_list_get_ctx(ptr)); +stat pw_aff::foreach_pw_aff(const std::function &fn) const +{ + return isl::union_pw_aff(*this).foreach_pw_aff(fn); } -void pw_aff_list::dump() const { - isl_pw_aff_list_dump(get()); +isl::set pw_aff::ge_set(isl::pw_aff pwaff2) const +{ + auto res = isl_pw_aff_ge_set(copy(), pwaff2.release()); + return manage(res); } - -isl::pw_aff_list pw_aff_list::add(isl::pw_aff el) const +isl::pw_aff pw_aff::gist(isl::set context) const { - auto res = isl_pw_aff_list_add(copy(), el.release()); + auto res = isl_pw_aff_gist(copy(), context.release()); return manage(res); } -isl::pw_aff_list pw_aff_list::alloc(isl::ctx ctx, int n) +isl::union_pw_aff pw_aff::gist(const isl::union_set &context) const { - auto res = isl_pw_aff_list_alloc(ctx.release(), n); - return manage(res); + return isl::union_pw_aff(*this).gist(context); } -isl::pw_aff_list pw_aff_list::clear() const 
+isl::pw_aff pw_aff::gist(const isl::basic_set &context) const { - auto res = isl_pw_aff_list_clear(copy()); - return manage(res); + return this->gist(isl::set(context)); } -isl::pw_aff_list pw_aff_list::concat(isl::pw_aff_list list2) const +isl::pw_aff pw_aff::gist(const isl::point &context) const { - auto res = isl_pw_aff_list_concat(copy(), list2.release()); + return this->gist(isl::set(context)); +} + +isl::set pw_aff::gt_set(isl::pw_aff pwaff2) const +{ + auto res = isl_pw_aff_gt_set(copy(), pwaff2.release()); return manage(res); } -isl::pw_aff_list pw_aff_list::drop(unsigned int first, unsigned int n) const +boolean pw_aff::has_range_tuple_id() const { - auto res = isl_pw_aff_list_drop(copy(), first, n); - return manage(res); + return isl::pw_multi_aff(*this).has_range_tuple_id(); } -isl::set pw_aff_list::eq_set(isl::pw_aff_list list2) const +isl::multi_pw_aff pw_aff::identity() const { - auto res = isl_pw_aff_list_eq_set(copy(), list2.release()); - return manage(res); + return isl::pw_multi_aff(*this).identity(); } -stat pw_aff_list::foreach(const std::function &fn) const +isl::pw_aff pw_aff::insert_domain(isl::space domain) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_pw_aff *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_pw_aff_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_pw_aff_insert_domain(copy(), domain.release()); return manage(res); } -isl::pw_aff_list pw_aff_list::from_pw_aff(isl::pw_aff el) +isl::pw_aff pw_aff::intersect_domain(isl::set set) const { - auto res = isl_pw_aff_list_from_pw_aff(el.release()); + auto res = isl_pw_aff_intersect_domain(copy(), set.release()); return manage(res); } -isl::set pw_aff_list::ge_set(isl::pw_aff_list list2) const +isl::union_pw_aff pw_aff::intersect_domain(const isl::space &space) const { - auto res = isl_pw_aff_list_ge_set(copy(), 
list2.release()); - return manage(res); + return isl::union_pw_aff(*this).intersect_domain(space); } -isl::pw_aff pw_aff_list::get_at(int index) const +isl::union_pw_aff pw_aff::intersect_domain(const isl::union_set &uset) const { - auto res = isl_pw_aff_list_get_at(get(), index); - return manage(res); + return isl::union_pw_aff(*this).intersect_domain(uset); } -isl::pw_aff pw_aff_list::get_pw_aff(int index) const +isl::pw_aff pw_aff::intersect_domain(const isl::basic_set &set) const { - auto res = isl_pw_aff_list_get_pw_aff(get(), index); - return manage(res); + return this->intersect_domain(isl::set(set)); } -isl::set pw_aff_list::gt_set(isl::pw_aff_list list2) const +isl::pw_aff pw_aff::intersect_domain(const isl::point &set) const { - auto res = isl_pw_aff_list_gt_set(copy(), list2.release()); - return manage(res); + return this->intersect_domain(isl::set(set)); } -isl::pw_aff_list pw_aff_list::insert(unsigned int pos, isl::pw_aff el) const +isl::union_pw_aff pw_aff::intersect_domain_wrapped_domain(const isl::union_set &uset) const { - auto res = isl_pw_aff_list_insert(copy(), pos, el.release()); - return manage(res); + return isl::union_pw_aff(*this).intersect_domain_wrapped_domain(uset); } -isl::set pw_aff_list::le_set(isl::pw_aff_list list2) const +isl::union_pw_aff pw_aff::intersect_domain_wrapped_range(const isl::union_set &uset) const { - auto res = isl_pw_aff_list_le_set(copy(), list2.release()); - return manage(res); + return isl::union_pw_aff(*this).intersect_domain_wrapped_range(uset); } -isl::set pw_aff_list::lt_set(isl::pw_aff_list list2) const +isl::pw_aff pw_aff::intersect_params(isl::set set) const { - auto res = isl_pw_aff_list_lt_set(copy(), list2.release()); + auto res = isl_pw_aff_intersect_params(copy(), set.release()); return manage(res); } -isl::pw_aff pw_aff_list::max() const +boolean pw_aff::involves_locals() const { - auto res = isl_pw_aff_list_max(copy()); - return manage(res); + return isl::pw_multi_aff(*this).involves_locals(); } 
-isl::pw_aff pw_aff_list::min() const +boolean pw_aff::involves_nan() const { - auto res = isl_pw_aff_list_min(copy()); - return manage(res); + return isl::multi_pw_aff(*this).involves_nan(); } -isl_size pw_aff_list::n_pw_aff() const +boolean pw_aff::involves_param(const isl::id &id) const { - auto res = isl_pw_aff_list_n_pw_aff(get()); - return res; + return isl::pw_multi_aff(*this).involves_param(id); } -isl::set pw_aff_list::ne_set(isl::pw_aff_list list2) const +boolean pw_aff::involves_param(const std::string &id) const { - auto res = isl_pw_aff_list_ne_set(copy(), list2.release()); - return manage(res); + return this->involves_param(isl::id(ctx(), id)); } -isl::pw_aff_list pw_aff_list::reverse() const +boolean pw_aff::involves_param(const isl::id_list &list) const { - auto res = isl_pw_aff_list_reverse(copy()); - return manage(res); + return isl::pw_multi_aff(*this).involves_param(list); } -isl::pw_aff_list pw_aff_list::set_pw_aff(int index, isl::pw_aff el) const +boolean pw_aff::is_cst() const { - auto res = isl_pw_aff_list_set_pw_aff(copy(), index, el.release()); + auto res = isl_pw_aff_is_cst(get()); return manage(res); } -isl_size pw_aff_list::size() const +boolean pw_aff::is_equal(const isl::pw_aff &pa2) const { - auto res = isl_pw_aff_list_size(get()); - return res; + auto res = isl_pw_aff_is_equal(get(), pa2.get()); + return manage(res); } -isl::pw_aff_list pw_aff_list::swap(unsigned int pos1, unsigned int pos2) const +boolean pw_aff::isa_aff() const { - auto res = isl_pw_aff_list_swap(copy(), pos1, pos2); + auto res = isl_pw_aff_isa_aff(get()); return manage(res); } -// implementations for isl::pw_multi_aff -pw_multi_aff manage(__isl_take isl_pw_multi_aff *ptr) { - return pw_multi_aff(ptr); -} -pw_multi_aff manage_copy(__isl_keep isl_pw_multi_aff *ptr) { - ptr = isl_pw_multi_aff_copy(ptr); - return pw_multi_aff(ptr); -} - -pw_multi_aff::pw_multi_aff() - : ptr(nullptr) {} - -pw_multi_aff::pw_multi_aff(const pw_multi_aff &obj) - : ptr(nullptr) +boolean 
pw_aff::isa_multi_aff() const { - ptr = obj.copy(); + return isl::pw_multi_aff(*this).isa_multi_aff(); } - -pw_multi_aff::pw_multi_aff(__isl_take isl_pw_multi_aff *ptr) - : ptr(ptr) {} - -pw_multi_aff::pw_multi_aff(isl::multi_aff ma) +boolean pw_aff::isa_pw_multi_aff() const { - auto res = isl_pw_multi_aff_from_multi_aff(ma.release()); - ptr = res; + return isl::union_pw_aff(*this).isa_pw_multi_aff(); } -pw_multi_aff::pw_multi_aff(isl::pw_aff pa) + +isl::set pw_aff::le_set(isl::pw_aff pwaff2) const { - auto res = isl_pw_multi_aff_from_pw_aff(pa.release()); - ptr = res; + auto res = isl_pw_aff_le_set(copy(), pwaff2.release()); + return manage(res); } -pw_multi_aff::pw_multi_aff(isl::ctx ctx, const std::string &str) + +isl::pw_aff_list pw_aff::list() const { - auto res = isl_pw_multi_aff_read_from_str(ctx.release(), str.c_str()); - ptr = res; + return isl::multi_pw_aff(*this).list(); } -pw_multi_aff &pw_multi_aff::operator=(pw_multi_aff obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::set pw_aff::lt_set(isl::pw_aff pwaff2) const +{ + auto res = isl_pw_aff_lt_set(copy(), pwaff2.release()); + return manage(res); } -pw_multi_aff::~pw_multi_aff() { - if (ptr) - isl_pw_multi_aff_free(ptr); +isl::multi_pw_aff pw_aff::max(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_multi_aff(*this).max(multi2); } -__isl_give isl_pw_multi_aff *pw_multi_aff::copy() const & { - return isl_pw_multi_aff_copy(ptr); +isl::pw_aff pw_aff::max(isl::pw_aff pwaff2) const +{ + auto res = isl_pw_aff_max(copy(), pwaff2.release()); + return manage(res); } -__isl_keep isl_pw_multi_aff *pw_multi_aff::get() const { - return ptr; +isl::pw_aff pw_aff::max(const isl::aff &pwaff2) const +{ + return this->max(isl::pw_aff(pwaff2)); } -__isl_give isl_pw_multi_aff *pw_multi_aff::release() { - isl_pw_multi_aff *tmp = ptr; - ptr = nullptr; - return tmp; +isl::multi_val pw_aff::max_multi_val() const +{ + return isl::pw_multi_aff(*this).max_multi_val(); } -bool pw_multi_aff::is_null() const { 
- return ptr == nullptr; +isl::multi_pw_aff pw_aff::min(const isl::multi_pw_aff &multi2) const +{ + return isl::pw_multi_aff(*this).min(multi2); } - -isl::ctx pw_multi_aff::ctx() const { - return isl::ctx(isl_pw_multi_aff_get_ctx(ptr)); +isl::pw_aff pw_aff::min(isl::pw_aff pwaff2) const +{ + auto res = isl_pw_aff_min(copy(), pwaff2.release()); + return manage(res); } -void pw_multi_aff::dump() const { - isl_pw_multi_aff_dump(get()); +isl::pw_aff pw_aff::min(const isl::aff &pwaff2) const +{ + return this->min(isl::pw_aff(pwaff2)); } - -isl::pw_multi_aff pw_multi_aff::add(isl::pw_multi_aff pma2) const +isl::multi_val pw_aff::min_multi_val() const { - auto res = isl_pw_multi_aff_add(copy(), pma2.release()); - return manage(res); + return isl::pw_multi_aff(*this).min_multi_val(); } -isl::pw_multi_aff pw_multi_aff::add_constant(isl::multi_val mv) const +isl::pw_aff pw_aff::mod(isl::val mod) const { - auto res = isl_pw_multi_aff_add_constant_multi_val(copy(), mv.release()); + auto res = isl_pw_aff_mod_val(copy(), mod.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::add_constant(isl::val v) const +isl::pw_aff pw_aff::mod(long mod) const { - auto res = isl_pw_multi_aff_add_constant_val(copy(), v.release()); - return manage(res); + return this->mod(isl::val(ctx(), mod)); } -isl::pw_multi_aff pw_multi_aff::align_params(isl::space model) const +isl::pw_aff pw_aff::mul(isl::pw_aff pwaff2) const { - auto res = isl_pw_multi_aff_align_params(copy(), model.release()); + auto res = isl_pw_aff_mul(copy(), pwaff2.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::alloc(isl::set set, isl::multi_aff maff) +class size pw_aff::n_piece() const { - auto res = isl_pw_multi_aff_alloc(set.release(), maff.release()); - return manage(res); + return isl::pw_multi_aff(*this).n_piece(); } -isl::multi_aff pw_multi_aff::as_multi_aff() const +isl::set pw_aff::ne_set(isl::pw_aff pwaff2) const { - auto res = isl_pw_multi_aff_as_multi_aff(copy()); + auto res = 
isl_pw_aff_ne_set(copy(), pwaff2.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::bind_domain(isl::multi_id tuple) const +isl::pw_aff pw_aff::neg() const { - auto res = isl_pw_multi_aff_bind_domain(copy(), tuple.release()); + auto res = isl_pw_aff_neg(copy()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const +isl::pw_aff pw_aff::param_on_domain(isl::set domain, isl::id id) { - auto res = isl_pw_multi_aff_bind_domain_wrapped_domain(copy(), tuple.release()); + auto res = isl_pw_aff_param_on_domain_id(domain.release(), id.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::coalesce() const +boolean pw_aff::plain_is_empty() const { - auto res = isl_pw_multi_aff_coalesce(copy()); - return manage(res); + return isl::union_pw_aff(*this).plain_is_empty(); } -isl_size pw_multi_aff::dim(isl::dim type) const +boolean pw_aff::plain_is_equal(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_dim(get(), static_cast(type)); - return res; + return isl::pw_multi_aff(*this).plain_is_equal(multi2); } -isl::set pw_multi_aff::domain() const +boolean pw_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_domain(copy()); - return manage(res); + return isl::union_pw_aff(*this).plain_is_equal(multi2); } -isl::pw_multi_aff pw_multi_aff::domain_map(isl::space space) +isl::pw_multi_aff pw_aff::preimage_domain_wrapped_domain(const isl::pw_multi_aff &pma2) const { - auto res = isl_pw_multi_aff_domain_map(space.release()); - return manage(res); + return isl::pw_multi_aff(*this).preimage_domain_wrapped_domain(pma2); } -isl::pw_multi_aff pw_multi_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_pw_multi_aff pw_aff::preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_multi_aff_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return 
isl::union_pw_aff(*this).preimage_domain_wrapped_domain(upma2); } -isl::pw_multi_aff pw_multi_aff::drop_unused_params() const +isl::multi_pw_aff pw_aff::product(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_drop_unused_params(copy()); - return manage(res); + return isl::pw_multi_aff(*this).product(multi2); } -isl::pw_multi_aff pw_multi_aff::empty(isl::space space) +isl::pw_multi_aff pw_aff::product(const isl::pw_multi_aff &pma2) const { - auto res = isl_pw_multi_aff_empty(space.release()); - return manage(res); + return isl::pw_multi_aff(*this).product(pma2); } -int pw_multi_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::pw_aff pw_aff::pullback(isl::multi_aff ma) const { - auto res = isl_pw_multi_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + auto res = isl_pw_aff_pullback_multi_aff(copy(), ma.release()); + return manage(res); } -isl::pw_multi_aff pw_multi_aff::fix_si(isl::dim type, unsigned int pos, int value) const +isl::pw_aff pw_aff::pullback(isl::multi_pw_aff mpa) const { - auto res = isl_pw_multi_aff_fix_si(copy(), static_cast(type), pos, value); + auto res = isl_pw_aff_pullback_multi_pw_aff(copy(), mpa.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::flat_range_product(isl::pw_multi_aff pma2) const +isl::pw_aff pw_aff::pullback(isl::pw_multi_aff pma) const { - auto res = isl_pw_multi_aff_flat_range_product(copy(), pma2.release()); + auto res = isl_pw_aff_pullback_pw_multi_aff(copy(), pma.release()); return manage(res); } -stat pw_multi_aff::foreach_piece(const std::function &fn) const +isl::union_pw_aff pw_aff::pullback(const isl::union_pw_multi_aff &upma) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_set *arg_0, isl_multi_aff *arg_1, void *arg_2) -> isl_stat { - auto *data = static_cast(arg_2); - stat ret = (*data->func)(manage(arg_0), manage(arg_1)); - return ret.release(); - }; - auto res = 
isl_pw_multi_aff_foreach_piece(get(), fn_lambda, &fn_data); - return manage(res); + return isl::union_pw_aff(*this).pullback(upma); } -isl::pw_multi_aff pw_multi_aff::from_domain(isl::set set) +isl::pw_multi_aff_list pw_aff::pw_multi_aff_list() const { - auto res = isl_pw_multi_aff_from_domain(set.release()); - return manage(res); + return isl::union_pw_aff(*this).pw_multi_aff_list(); } -isl::pw_multi_aff pw_multi_aff::from_map(isl::map map) +isl::pw_multi_aff pw_aff::range_factor_domain() const { - auto res = isl_pw_multi_aff_from_map(map.release()); - return manage(res); + return isl::pw_multi_aff(*this).range_factor_domain(); } -isl::pw_multi_aff pw_multi_aff::from_multi_pw_aff(isl::multi_pw_aff mpa) +isl::pw_multi_aff pw_aff::range_factor_range() const { - auto res = isl_pw_multi_aff_from_multi_pw_aff(mpa.release()); - return manage(res); + return isl::pw_multi_aff(*this).range_factor_range(); } -isl::pw_multi_aff pw_multi_aff::from_set(isl::set set) +isl::multi_pw_aff pw_aff::range_product(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_from_set(set.release()); - return manage(res); + return isl::pw_multi_aff(*this).range_product(multi2); } -isl::id pw_multi_aff::get_dim_id(isl::dim type, unsigned int pos) const +isl::multi_union_pw_aff pw_aff::range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_get_dim_id(get(), static_cast(type), pos); - return manage(res); + return isl::union_pw_aff(*this).range_product(multi2); } -std::string pw_multi_aff::get_dim_name(isl::dim type, unsigned int pos) const +isl::pw_multi_aff pw_aff::range_product(const isl::pw_multi_aff &pma2) const { - auto res = isl_pw_multi_aff_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + return isl::pw_multi_aff(*this).range_product(pma2); } -isl::space pw_multi_aff::get_domain_space() const +isl::union_pw_multi_aff pw_aff::range_product(const isl::union_pw_multi_aff &upma2) const { - auto res = 
isl_pw_multi_aff_get_domain_space(get()); - return manage(res); + return isl::union_pw_aff(*this).range_product(upma2); } -isl::pw_aff pw_multi_aff::get_pw_aff(int pos) const +isl::id pw_aff::range_tuple_id() const { - auto res = isl_pw_multi_aff_get_pw_aff(get(), pos); - return manage(res); + return isl::pw_multi_aff(*this).range_tuple_id(); } -isl::space pw_multi_aff::get_space() const +isl::multi_pw_aff pw_aff::reset_range_tuple_id() const { - auto res = isl_pw_multi_aff_get_space(get()); - return manage(res); + return isl::multi_pw_aff(*this).reset_range_tuple_id(); } -isl::id pw_multi_aff::get_tuple_id(isl::dim type) const +isl::multi_pw_aff pw_aff::reset_tuple_id(isl::dim type) const { - auto res = isl_pw_multi_aff_get_tuple_id(get(), static_cast(type)); - return manage(res); + return isl::multi_pw_aff(*this).reset_tuple_id(type); } -std::string pw_multi_aff::get_tuple_name(isl::dim type) const +isl::multi_pw_aff pw_aff::scale(const isl::multi_val &mv) const { - auto res = isl_pw_multi_aff_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + return isl::multi_pw_aff(*this).scale(mv); } -isl::pw_multi_aff pw_multi_aff::gist(isl::set set) const +isl::pw_aff pw_aff::scale(isl::val v) const { - auto res = isl_pw_multi_aff_gist(copy(), set.release()); + auto res = isl_pw_aff_scale_val(copy(), v.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::gist_params(isl::set set) const +isl::pw_aff pw_aff::scale(long v) const { - auto res = isl_pw_multi_aff_gist_params(copy(), set.release()); - return manage(res); + return this->scale(isl::val(ctx(), v)); } -boolean pw_multi_aff::has_tuple_id(isl::dim type) const +isl::multi_pw_aff pw_aff::scale_down(const isl::multi_val &mv) const { - auto res = isl_pw_multi_aff_has_tuple_id(get(), static_cast(type)); - return manage(res); + return isl::multi_pw_aff(*this).scale_down(mv); } -boolean pw_multi_aff::has_tuple_name(isl::dim type) const +isl::pw_aff pw_aff::scale_down(isl::val f) 
const { - auto res = isl_pw_multi_aff_has_tuple_name(get(), static_cast(type)); + auto res = isl_pw_aff_scale_down_val(copy(), f.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::identity(isl::space space) +isl::pw_aff pw_aff::scale_down(long f) const { - auto res = isl_pw_multi_aff_identity(space.release()); - return manage(res); + return this->scale_down(isl::val(ctx(), f)); } -isl::pw_multi_aff pw_multi_aff::identity_on_domain(isl::space space) +isl::multi_pw_aff pw_aff::set_at(int pos, const isl::pw_aff &el) const { - auto res = isl_pw_multi_aff_identity_on_domain_space(space.release()); - return manage(res); + return isl::pw_multi_aff(*this).set_at(pos, el); } -isl::pw_multi_aff pw_multi_aff::insert_domain(isl::space domain) const +isl::multi_union_pw_aff pw_aff::set_at(int pos, const isl::union_pw_aff &el) const { - auto res = isl_pw_multi_aff_insert_domain(copy(), domain.release()); - return manage(res); + return isl::union_pw_aff(*this).set_at(pos, el); } -isl::pw_multi_aff pw_multi_aff::intersect_domain(isl::set set) const +isl::multi_pw_aff pw_aff::set_pw_aff(int pos, const isl::pw_aff &el) const { - auto res = isl_pw_multi_aff_intersect_domain(copy(), set.release()); - return manage(res); + return isl::pw_multi_aff(*this).set_pw_aff(pos, el); } -isl::pw_multi_aff pw_multi_aff::intersect_domain_wrapped_domain(isl::set set) const +isl::pw_multi_aff pw_aff::set_pw_aff(unsigned int pos, const isl::pw_aff &pa) const { - auto res = isl_pw_multi_aff_intersect_domain_wrapped_domain(copy(), set.release()); - return manage(res); + return isl::pw_multi_aff(*this).set_pw_aff(pos, pa); } -isl::pw_multi_aff pw_multi_aff::intersect_domain_wrapped_range(isl::set set) const +isl::pw_multi_aff pw_aff::set_range_tuple(const isl::id &id) const { - auto res = isl_pw_multi_aff_intersect_domain_wrapped_range(copy(), set.release()); - return manage(res); + return isl::pw_multi_aff(*this).set_range_tuple(id); } -isl::pw_multi_aff 
pw_multi_aff::intersect_params(isl::set set) const +isl::pw_multi_aff pw_aff::set_range_tuple(const std::string &id) const { - auto res = isl_pw_multi_aff_intersect_params(copy(), set.release()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -boolean pw_multi_aff::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::pw_aff pw_aff::set_tuple_id(isl::dim type, isl::id id) const { - auto res = isl_pw_multi_aff_involves_dims(get(), static_cast(type), first, n); + auto res = isl_pw_aff_set_tuple_id(copy(), static_cast(type), id.release()); return manage(res); } -boolean pw_multi_aff::involves_locals() const +isl::pw_aff pw_aff::set_tuple_id(isl::dim type, const std::string &id) const { - auto res = isl_pw_multi_aff_involves_locals(get()); - return manage(res); + return this->set_tuple_id(type, isl::id(ctx(), id)); } -boolean pw_multi_aff::involves_nan() const +isl::multi_union_pw_aff pw_aff::set_union_pw_aff(int pos, const isl::union_pw_aff &el) const { - auto res = isl_pw_multi_aff_involves_nan(get()); - return manage(res); + return isl::union_pw_aff(*this).set_union_pw_aff(pos, el); } -boolean pw_multi_aff::involves_param_id(const isl::id &id) const +class size pw_aff::size() const { - auto res = isl_pw_multi_aff_involves_param_id(get(), id.get()); - return manage(res); + return isl::multi_pw_aff(*this).size(); } -boolean pw_multi_aff::is_equal(const isl::pw_multi_aff &pma2) const +isl::space pw_aff::space() const { - auto res = isl_pw_multi_aff_is_equal(get(), pma2.get()); + auto res = isl_pw_aff_get_space(get()); return manage(res); } -boolean pw_multi_aff::isa_multi_aff() const +isl::space pw_aff::get_space() const { - auto res = isl_pw_multi_aff_isa_multi_aff(get()); - return manage(res); + return space(); } -isl::multi_val pw_multi_aff::max_multi_val() const +isl::multi_pw_aff pw_aff::sub(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_max_multi_val(copy()); - return manage(res); + 
return isl::pw_multi_aff(*this).sub(multi2); } -isl::multi_val pw_multi_aff::min_multi_val() const +isl::multi_union_pw_aff pw_aff::sub(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_pw_multi_aff_min_multi_val(copy()); - return manage(res); + return isl::union_pw_aff(*this).sub(multi2); } -isl::pw_multi_aff pw_multi_aff::multi_val_on_domain(isl::set domain, isl::multi_val mv) +isl::pw_aff pw_aff::sub(isl::pw_aff pwaff2) const { - auto res = isl_pw_multi_aff_multi_val_on_domain(domain.release(), mv.release()); + auto res = isl_pw_aff_sub(copy(), pwaff2.release()); return manage(res); } -isl_size pw_multi_aff::n_piece() const +isl::pw_multi_aff pw_aff::sub(const isl::pw_multi_aff &pma2) const { - auto res = isl_pw_multi_aff_n_piece(get()); - return res; + return isl::pw_multi_aff(*this).sub(pma2); } -isl::pw_multi_aff pw_multi_aff::neg() const +isl::union_pw_aff pw_aff::sub(const isl::union_pw_aff &upa2) const { - auto res = isl_pw_multi_aff_neg(copy()); - return manage(res); + return isl::union_pw_aff(*this).sub(upa2); } -boolean pw_multi_aff::plain_is_equal(const isl::pw_multi_aff &pma2) const +isl::union_pw_multi_aff pw_aff::sub(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_multi_aff_plain_is_equal(get(), pma2.get()); - return manage(res); + return isl::union_pw_aff(*this).sub(upma2); } -isl::pw_multi_aff pw_multi_aff::preimage_domain_wrapped_domain(isl::pw_multi_aff pma2) const +isl::pw_aff pw_aff::sub(const isl::aff &pwaff2) const { - auto res = isl_pw_multi_aff_preimage_domain_wrapped_domain_pw_multi_aff(copy(), pma2.release()); - return manage(res); + return this->sub(isl::pw_aff(pwaff2)); } -isl::pw_multi_aff pw_multi_aff::product(isl::pw_multi_aff pma2) const +isl::pw_aff pw_aff::subtract_domain(isl::set set) const { - auto res = isl_pw_multi_aff_product(copy(), pma2.release()); + auto res = isl_pw_aff_subtract_domain(copy(), set.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::project_domain_on_params() 
const +isl::union_pw_aff pw_aff::subtract_domain(const isl::space &space) const { - auto res = isl_pw_multi_aff_project_domain_on_params(copy()); - return manage(res); + return isl::union_pw_aff(*this).subtract_domain(space); } -isl::pw_multi_aff pw_multi_aff::project_out_map(isl::space space, isl::dim type, unsigned int first, unsigned int n) +isl::union_pw_aff pw_aff::subtract_domain(const isl::union_set &uset) const { - auto res = isl_pw_multi_aff_project_out_map(space.release(), static_cast(type), first, n); - return manage(res); + return isl::union_pw_aff(*this).subtract_domain(uset); } -isl::pw_multi_aff pw_multi_aff::pullback(isl::multi_aff ma) const +isl::pw_aff pw_aff::subtract_domain(const isl::basic_set &set) const { - auto res = isl_pw_multi_aff_pullback_multi_aff(copy(), ma.release()); - return manage(res); + return this->subtract_domain(isl::set(set)); } -isl::pw_multi_aff pw_multi_aff::pullback(isl::pw_multi_aff pma2) const +isl::pw_aff pw_aff::subtract_domain(const isl::point &set) const { - auto res = isl_pw_multi_aff_pullback_pw_multi_aff(copy(), pma2.release()); - return manage(res); + return this->subtract_domain(isl::set(set)); } -isl::pw_multi_aff pw_multi_aff::range_factor_domain() const +isl::pw_aff pw_aff::tdiv_q(isl::pw_aff pa2) const { - auto res = isl_pw_multi_aff_range_factor_domain(copy()); + auto res = isl_pw_aff_tdiv_q(copy(), pa2.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::range_factor_range() const +isl::pw_aff pw_aff::tdiv_r(isl::pw_aff pa2) const { - auto res = isl_pw_multi_aff_range_factor_range(copy()); + auto res = isl_pw_aff_tdiv_r(copy(), pa2.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::range_map(isl::space space) +isl::pw_aff_list pw_aff::to_list() const { - auto res = isl_pw_multi_aff_range_map(space.release()); + auto res = isl_pw_aff_to_list(copy()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::range_product(isl::pw_multi_aff pma2) const +isl::multi_pw_aff 
pw_aff::to_multi_pw_aff() const { - auto res = isl_pw_multi_aff_range_product(copy(), pma2.release()); - return manage(res); + return isl::pw_multi_aff(*this).to_multi_pw_aff(); } -isl::pw_multi_aff pw_multi_aff::reset_tuple_id(isl::dim type) const +isl::union_pw_aff pw_aff::to_union_pw_aff() const { - auto res = isl_pw_multi_aff_reset_tuple_id(copy(), static_cast(type)); + auto res = isl_pw_aff_to_union_pw_aff(copy()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::reset_user() const +isl::union_pw_multi_aff pw_aff::to_union_pw_multi_aff() const { - auto res = isl_pw_multi_aff_reset_user(copy()); - return manage(res); + return isl::pw_multi_aff(*this).to_union_pw_multi_aff(); } -isl::pw_multi_aff pw_multi_aff::scale(isl::val v) const +isl::id pw_aff::tuple_id(isl::dim type) const { - auto res = isl_pw_multi_aff_scale_val(copy(), v.release()); + auto res = isl_pw_aff_get_tuple_id(get(), static_cast(type)); return manage(res); } -isl::pw_multi_aff pw_multi_aff::scale_down(isl::val v) const +isl::id pw_aff::get_tuple_id(isl::dim type) const { - auto res = isl_pw_multi_aff_scale_down_val(copy(), v.release()); - return manage(res); + return tuple_id(type); } -isl::pw_multi_aff pw_multi_aff::scale_multi_val(isl::multi_val mv) const +isl::multi_pw_aff pw_aff::unbind_params_insert_domain(const isl::multi_id &domain) const { - auto res = isl_pw_multi_aff_scale_multi_val(copy(), mv.release()); - return manage(res); + return isl::pw_multi_aff(*this).unbind_params_insert_domain(domain); } -isl::pw_multi_aff pw_multi_aff::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::multi_pw_aff pw_aff::union_add(const isl::multi_pw_aff &mpa2) const { - auto res = isl_pw_multi_aff_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return isl::pw_multi_aff(*this).union_add(mpa2); } -isl::pw_multi_aff pw_multi_aff::set_pw_aff(unsigned int pos, isl::pw_aff pa) const +isl::multi_union_pw_aff pw_aff::union_add(const 
isl::multi_union_pw_aff &mupa2) const { - auto res = isl_pw_multi_aff_set_pw_aff(copy(), pos, pa.release()); - return manage(res); + return isl::union_pw_aff(*this).union_add(mupa2); } -isl::pw_multi_aff pw_multi_aff::set_tuple_id(isl::dim type, isl::id id) const +isl::pw_aff pw_aff::union_add(isl::pw_aff pwaff2) const { - auto res = isl_pw_multi_aff_set_tuple_id(copy(), static_cast(type), id.release()); + auto res = isl_pw_aff_union_add(copy(), pwaff2.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff::sub(isl::pw_multi_aff pma2) const +isl::pw_multi_aff pw_aff::union_add(const isl::pw_multi_aff &pma2) const { - auto res = isl_pw_multi_aff_sub(copy(), pma2.release()); - return manage(res); + return isl::pw_multi_aff(*this).union_add(pma2); } -isl::pw_multi_aff pw_multi_aff::subtract_domain(isl::set set) const +isl::union_pw_aff pw_aff::union_add(const isl::union_pw_aff &upa2) const { - auto res = isl_pw_multi_aff_subtract_domain(copy(), set.release()); - return manage(res); + return isl::union_pw_aff(*this).union_add(upa2); } -isl::pw_multi_aff pw_multi_aff::union_add(isl::pw_multi_aff pma2) const +isl::union_pw_multi_aff pw_aff::union_add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_multi_aff_union_add(copy(), pma2.release()); - return manage(res); + return isl::union_pw_aff(*this).union_add(upma2); } -isl::pw_multi_aff pw_multi_aff::union_lexmax(isl::pw_multi_aff pma2) const +isl::pw_aff pw_aff::union_add(const isl::aff &pwaff2) const { - auto res = isl_pw_multi_aff_union_lexmax(copy(), pma2.release()); - return manage(res); + return this->union_add(isl::pw_aff(pwaff2)); } -isl::pw_multi_aff pw_multi_aff::union_lexmin(isl::pw_multi_aff pma2) const +isl::pw_aff pw_aff::var_on_domain(isl::local_space ls, isl::dim type, unsigned int pos) { - auto res = isl_pw_multi_aff_union_lexmin(copy(), pma2.release()); + auto res = isl_pw_aff_var_on_domain(ls.release(), static_cast(type), pos); return manage(res); } -isl::pw_multi_aff 
pw_multi_aff::zero(isl::space space) +inline std::ostream &operator<<(std::ostream &os, const pw_aff &obj) { - auto res = isl_pw_multi_aff_zero(space.release()); - return manage(res); + char *str = isl_pw_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::pw_multi_aff_list -pw_multi_aff_list manage(__isl_take isl_pw_multi_aff_list *ptr) { - return pw_multi_aff_list(ptr); +// implementations for isl::pw_aff_list +pw_aff_list manage(__isl_take isl_pw_aff_list *ptr) { + return pw_aff_list(ptr); } -pw_multi_aff_list manage_copy(__isl_keep isl_pw_multi_aff_list *ptr) { - ptr = isl_pw_multi_aff_list_copy(ptr); - return pw_multi_aff_list(ptr); +pw_aff_list manage_copy(__isl_keep isl_pw_aff_list *ptr) { + ptr = isl_pw_aff_list_copy(ptr); + return pw_aff_list(ptr); } -pw_multi_aff_list::pw_multi_aff_list() +pw_aff_list::pw_aff_list() : ptr(nullptr) {} -pw_multi_aff_list::pw_multi_aff_list(const pw_multi_aff_list &obj) +pw_aff_list::pw_aff_list(const pw_aff_list &obj) : ptr(nullptr) { ptr = obj.copy(); } - -pw_multi_aff_list::pw_multi_aff_list(__isl_take isl_pw_multi_aff_list *ptr) +pw_aff_list::pw_aff_list(__isl_take isl_pw_aff_list *ptr) : ptr(ptr) {} +pw_aff_list::pw_aff_list(isl::ctx ctx, int n) +{ + auto res = isl_pw_aff_list_alloc(ctx.release(), n); + ptr = res; +} + +pw_aff_list::pw_aff_list(isl::pw_aff el) +{ + auto res = isl_pw_aff_list_from_pw_aff(el.release()); + ptr = res; +} + +pw_aff_list::pw_aff_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_pw_aff_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} -pw_multi_aff_list &pw_multi_aff_list::operator=(pw_multi_aff_list obj) { +pw_aff_list &pw_aff_list::operator=(pw_aff_list obj) { std::swap(this->ptr, obj.ptr); return *this; } -pw_multi_aff_list::~pw_multi_aff_list() { +pw_aff_list::~pw_aff_list() { if (ptr) - isl_pw_multi_aff_list_free(ptr); + isl_pw_aff_list_free(ptr); } 
-__isl_give isl_pw_multi_aff_list *pw_multi_aff_list::copy() const & { - return isl_pw_multi_aff_list_copy(ptr); +__isl_give isl_pw_aff_list *pw_aff_list::copy() const & { + return isl_pw_aff_list_copy(ptr); } -__isl_keep isl_pw_multi_aff_list *pw_multi_aff_list::get() const { +__isl_keep isl_pw_aff_list *pw_aff_list::get() const { return ptr; } -__isl_give isl_pw_multi_aff_list *pw_multi_aff_list::release() { - isl_pw_multi_aff_list *tmp = ptr; +__isl_give isl_pw_aff_list *pw_aff_list::release() { + isl_pw_aff_list *tmp = ptr; ptr = nullptr; return tmp; } -bool pw_multi_aff_list::is_null() const { +bool pw_aff_list::is_null() const { return ptr == nullptr; } - -isl::ctx pw_multi_aff_list::ctx() const { - return isl::ctx(isl_pw_multi_aff_list_get_ctx(ptr)); +isl::ctx pw_aff_list::ctx() const { + return isl::ctx(isl_pw_aff_list_get_ctx(ptr)); } -void pw_multi_aff_list::dump() const { - isl_pw_multi_aff_list_dump(get()); +isl::pw_aff_list pw_aff_list::add(isl::pw_aff el) const +{ + auto res = isl_pw_aff_list_add(copy(), el.release()); + return manage(res); } - -isl::pw_multi_aff_list pw_multi_aff_list::add(isl::pw_multi_aff el) const +isl::pw_aff pw_aff_list::at(int index) const { - auto res = isl_pw_multi_aff_list_add(copy(), el.release()); + auto res = isl_pw_aff_list_get_at(get(), index); return manage(res); } -isl::pw_multi_aff_list pw_multi_aff_list::alloc(isl::ctx ctx, int n) +isl::pw_aff pw_aff_list::get_at(int index) const { - auto res = isl_pw_multi_aff_list_alloc(ctx.release(), n); - return manage(res); + return at(index); } -isl::pw_multi_aff_list pw_multi_aff_list::clear() const +isl::pw_aff_list pw_aff_list::clear() const { - auto res = isl_pw_multi_aff_list_clear(copy()); + auto res = isl_pw_aff_list_clear(copy()); return manage(res); } -isl::pw_multi_aff_list pw_multi_aff_list::concat(isl::pw_multi_aff_list list2) const +isl::pw_aff_list pw_aff_list::concat(isl::pw_aff_list list2) const { - auto res = isl_pw_multi_aff_list_concat(copy(), 
list2.release()); + auto res = isl_pw_aff_list_concat(copy(), list2.release()); return manage(res); } -isl::pw_multi_aff_list pw_multi_aff_list::drop(unsigned int first, unsigned int n) const +isl::pw_aff_list pw_aff_list::drop(unsigned int first, unsigned int n) const { - auto res = isl_pw_multi_aff_list_drop(copy(), first, n); + auto res = isl_pw_aff_list_drop(copy(), first, n); return manage(res); } -stat pw_multi_aff_list::foreach(const std::function &fn) const +stat pw_aff_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_pw_multi_aff *arg_0, void *arg_1) -> isl_stat { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_pw_aff *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; - auto res = isl_pw_multi_aff_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_pw_aff_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::pw_multi_aff_list pw_multi_aff_list::from_pw_multi_aff(isl::pw_multi_aff el) +isl::pw_aff_list pw_aff_list::insert(unsigned int pos, isl::pw_aff el) const { - auto res = isl_pw_multi_aff_list_from_pw_multi_aff(el.release()); + auto res = isl_pw_aff_list_insert(copy(), pos, el.release()); return manage(res); } -isl::pw_multi_aff pw_multi_aff_list::get_at(int index) const +class size pw_aff_list::size() const { - auto res = isl_pw_multi_aff_list_get_at(get(), index); + auto res = isl_pw_aff_list_size(get()); return manage(res); } -isl::pw_multi_aff pw_multi_aff_list::get_pw_multi_aff(int index) const +inline std::ostream &operator<<(std::ostream &os, const pw_aff_list &obj) { - auto res = isl_pw_multi_aff_list_get_pw_multi_aff(get(), index); - return manage(res); + char *str = isl_pw_aff_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + 
free(str); + return os; } -isl::pw_multi_aff_list pw_multi_aff_list::insert(unsigned int pos, isl::pw_multi_aff el) const -{ - auto res = isl_pw_multi_aff_list_insert(copy(), pos, el.release()); - return manage(res); +// implementations for isl::pw_multi_aff +pw_multi_aff manage(__isl_take isl_pw_multi_aff *ptr) { + return pw_multi_aff(ptr); } - -isl_size pw_multi_aff_list::n_pw_multi_aff() const -{ - auto res = isl_pw_multi_aff_list_n_pw_multi_aff(get()); - return res; +pw_multi_aff manage_copy(__isl_keep isl_pw_multi_aff *ptr) { + ptr = isl_pw_multi_aff_copy(ptr); + return pw_multi_aff(ptr); } -isl::pw_multi_aff_list pw_multi_aff_list::reverse() const -{ - auto res = isl_pw_multi_aff_list_reverse(copy()); - return manage(res); -} +pw_multi_aff::pw_multi_aff() + : ptr(nullptr) {} -isl::pw_multi_aff_list pw_multi_aff_list::set_pw_multi_aff(int index, isl::pw_multi_aff el) const +pw_multi_aff::pw_multi_aff(const pw_multi_aff &obj) + : ptr(nullptr) { - auto res = isl_pw_multi_aff_list_set_pw_multi_aff(copy(), index, el.release()); - return manage(res); + ptr = obj.copy(); } -isl_size pw_multi_aff_list::size() const -{ - auto res = isl_pw_multi_aff_list_size(get()); - return res; -} +pw_multi_aff::pw_multi_aff(__isl_take isl_pw_multi_aff *ptr) + : ptr(ptr) {} -isl::pw_multi_aff_list pw_multi_aff_list::swap(unsigned int pos1, unsigned int pos2) const +pw_multi_aff::pw_multi_aff(isl::multi_aff ma) { - auto res = isl_pw_multi_aff_list_swap(copy(), pos1, pos2); - return manage(res); -} - -// implementations for isl::pw_qpolynomial -pw_qpolynomial manage(__isl_take isl_pw_qpolynomial *ptr) { - return pw_qpolynomial(ptr); -} -pw_qpolynomial manage_copy(__isl_keep isl_pw_qpolynomial *ptr) { - ptr = isl_pw_qpolynomial_copy(ptr); - return pw_qpolynomial(ptr); + auto res = isl_pw_multi_aff_from_multi_aff(ma.release()); + ptr = res; } -pw_qpolynomial::pw_qpolynomial() - : ptr(nullptr) {} - -pw_qpolynomial::pw_qpolynomial(const pw_qpolynomial &obj) - : ptr(nullptr) 
+pw_multi_aff::pw_multi_aff(isl::pw_aff pa) { - ptr = obj.copy(); + auto res = isl_pw_multi_aff_from_pw_aff(pa.release()); + ptr = res; } - -pw_qpolynomial::pw_qpolynomial(__isl_take isl_pw_qpolynomial *ptr) - : ptr(ptr) {} - -pw_qpolynomial::pw_qpolynomial(isl::ctx ctx, const std::string &str) +pw_multi_aff::pw_multi_aff(isl::ctx ctx, const std::string &str) { - auto res = isl_pw_qpolynomial_read_from_str(ctx.release(), str.c_str()); + auto res = isl_pw_multi_aff_read_from_str(ctx.release(), str.c_str()); ptr = res; } -pw_qpolynomial &pw_qpolynomial::operator=(pw_qpolynomial obj) { +pw_multi_aff &pw_multi_aff::operator=(pw_multi_aff obj) { std::swap(this->ptr, obj.ptr); return *this; } -pw_qpolynomial::~pw_qpolynomial() { +pw_multi_aff::~pw_multi_aff() { if (ptr) - isl_pw_qpolynomial_free(ptr); + isl_pw_multi_aff_free(ptr); } -__isl_give isl_pw_qpolynomial *pw_qpolynomial::copy() const & { - return isl_pw_qpolynomial_copy(ptr); +__isl_give isl_pw_multi_aff *pw_multi_aff::copy() const & { + return isl_pw_multi_aff_copy(ptr); } -__isl_keep isl_pw_qpolynomial *pw_qpolynomial::get() const { +__isl_keep isl_pw_multi_aff *pw_multi_aff::get() const { return ptr; } -__isl_give isl_pw_qpolynomial *pw_qpolynomial::release() { - isl_pw_qpolynomial *tmp = ptr; +__isl_give isl_pw_multi_aff *pw_multi_aff::release() { + isl_pw_multi_aff *tmp = ptr; ptr = nullptr; return tmp; } -bool pw_qpolynomial::is_null() const { +bool pw_multi_aff::is_null() const { return ptr == nullptr; } - -isl::ctx pw_qpolynomial::ctx() const { - return isl::ctx(isl_pw_qpolynomial_get_ctx(ptr)); +isl::ctx pw_multi_aff::ctx() const { + return isl::ctx(isl_pw_multi_aff_get_ctx(ptr)); } -void pw_qpolynomial::dump() const { - isl_pw_qpolynomial_dump(get()); +isl::multi_pw_aff pw_multi_aff::add(const isl::multi_pw_aff &multi2) const +{ + return isl::multi_pw_aff(*this).add(multi2); } +isl::multi_union_pw_aff pw_multi_aff::add(const isl::multi_union_pw_aff &multi2) const +{ + return 
isl::multi_pw_aff(*this).add(multi2); +} -isl::pw_qpolynomial pw_qpolynomial::add(isl::pw_qpolynomial pwqp2) const +isl::pw_multi_aff pw_multi_aff::add(isl::pw_multi_aff pma2) const { - auto res = isl_pw_qpolynomial_add(copy(), pwqp2.release()); + auto res = isl_pw_multi_aff_add(copy(), pma2.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::add_dims(isl::dim type, unsigned int n) const +isl::union_pw_multi_aff pw_multi_aff::add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_qpolynomial_add_dims(copy(), static_cast(type), n); - return manage(res); + return isl::union_pw_multi_aff(*this).add(upma2); } -isl::pw_qpolynomial pw_qpolynomial::alloc(isl::set set, isl::qpolynomial qp) +isl::pw_multi_aff pw_multi_aff::add(const isl::multi_aff &pma2) const { - auto res = isl_pw_qpolynomial_alloc(set.release(), qp.release()); - return manage(res); + return this->add(isl::pw_multi_aff(pma2)); } -isl::qpolynomial pw_qpolynomial::as_qpolynomial() const +isl::pw_multi_aff pw_multi_aff::add(const isl::pw_aff &pma2) const { - auto res = isl_pw_qpolynomial_as_qpolynomial(copy()); + return this->add(isl::pw_multi_aff(pma2)); +} + +isl::pw_multi_aff pw_multi_aff::add_constant(isl::multi_val mv) const +{ + auto res = isl_pw_multi_aff_add_constant_multi_val(copy(), mv.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::coalesce() const +isl::pw_multi_aff pw_multi_aff::add_constant(isl::val v) const { - auto res = isl_pw_qpolynomial_coalesce(copy()); + auto res = isl_pw_multi_aff_add_constant_val(copy(), v.release()); return manage(res); } -isl_size pw_qpolynomial::dim(isl::dim type) const +isl::pw_multi_aff pw_multi_aff::add_constant(long v) const { - auto res = isl_pw_qpolynomial_dim(get(), static_cast(type)); - return res; + return this->add_constant(isl::val(ctx(), v)); } -isl::set pw_qpolynomial::domain() const +isl::union_pw_multi_aff pw_multi_aff::add_pw_multi_aff(const isl::pw_multi_aff &pma) const { - auto res = 
isl_pw_qpolynomial_domain(copy()); - return manage(res); + return isl::union_pw_multi_aff(*this).add_pw_multi_aff(pma); } -isl::pw_qpolynomial pw_qpolynomial::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_pw_multi_aff pw_multi_aff::apply(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_qpolynomial_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::union_pw_multi_aff(*this).apply(upma2); } -isl::pw_qpolynomial pw_qpolynomial::drop_unused_params() const +isl::map pw_multi_aff::as_map() const { - auto res = isl_pw_qpolynomial_drop_unused_params(copy()); + auto res = isl_pw_multi_aff_as_map(copy()); return manage(res); } -isl::val pw_qpolynomial::eval(isl::point pnt) const +isl::multi_aff pw_multi_aff::as_multi_aff() const { - auto res = isl_pw_qpolynomial_eval(copy(), pnt.release()); + auto res = isl_pw_multi_aff_as_multi_aff(copy()); return manage(res); } -int pw_qpolynomial::find_dim_by_name(isl::dim type, const std::string &name) const +isl::multi_union_pw_aff pw_multi_aff::as_multi_union_pw_aff() const { - auto res = isl_pw_qpolynomial_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return isl::union_pw_multi_aff(*this).as_multi_union_pw_aff(); } -isl::pw_qpolynomial pw_qpolynomial::fix_val(isl::dim type, unsigned int n, isl::val v) const +isl::pw_multi_aff pw_multi_aff::as_pw_multi_aff() const { - auto res = isl_pw_qpolynomial_fix_val(copy(), static_cast(type), n, v.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).as_pw_multi_aff(); } -stat pw_qpolynomial::foreach_piece(const std::function &fn) const +isl::set pw_multi_aff::as_set() const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_set *arg_0, isl_qpolynomial *arg_1, void *arg_2) -> isl_stat { - auto *data = static_cast(arg_2); - stat ret = (*data->func)(manage(arg_0), manage(arg_1)); - return ret.release(); - }; - auto res = 
isl_pw_qpolynomial_foreach_piece(get(), fn_lambda, &fn_data); + auto res = isl_pw_multi_aff_as_set(copy()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::from_pw_aff(isl::pw_aff pwaff) +isl::union_map pw_multi_aff::as_union_map() const { - auto res = isl_pw_qpolynomial_from_pw_aff(pwaff.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).as_union_map(); } -isl::pw_qpolynomial pw_qpolynomial::from_qpolynomial(isl::qpolynomial qp) +isl::pw_aff pw_multi_aff::at(int pos) const { - auto res = isl_pw_qpolynomial_from_qpolynomial(qp.release()); + auto res = isl_pw_multi_aff_get_at(get(), pos); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::from_range() const +isl::pw_aff pw_multi_aff::get_at(int pos) const { - auto res = isl_pw_qpolynomial_from_range(copy()); - return manage(res); + return at(pos); } -isl::space pw_qpolynomial::get_domain_space() const +isl::set pw_multi_aff::bind(const isl::multi_id &tuple) const { - auto res = isl_pw_qpolynomial_get_domain_space(get()); - return manage(res); + return isl::multi_pw_aff(*this).bind(tuple); } -isl::space pw_qpolynomial::get_space() const +isl::pw_multi_aff pw_multi_aff::bind_domain(isl::multi_id tuple) const { - auto res = isl_pw_qpolynomial_get_space(get()); + auto res = isl_pw_multi_aff_bind_domain(copy(), tuple.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::gist(isl::set context) const +isl::pw_multi_aff pw_multi_aff::bind_domain_wrapped_domain(isl::multi_id tuple) const { - auto res = isl_pw_qpolynomial_gist(copy(), context.release()); + auto res = isl_pw_multi_aff_bind_domain_wrapped_domain(copy(), tuple.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::gist_params(isl::set context) const +isl::pw_multi_aff pw_multi_aff::coalesce() const { - auto res = isl_pw_qpolynomial_gist_params(copy(), context.release()); + auto res = isl_pw_multi_aff_coalesce(copy()); return manage(res); } -boolean pw_qpolynomial::has_equal_space(const 
isl::pw_qpolynomial &pwqp2) const +class size pw_multi_aff::dim(isl::dim type) const { - auto res = isl_pw_qpolynomial_has_equal_space(get(), pwqp2.get()); + auto res = isl_pw_multi_aff_dim(get(), static_cast(type)); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::insert_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::set pw_multi_aff::domain() const { - auto res = isl_pw_qpolynomial_insert_dims(copy(), static_cast(type), first, n); + auto res = isl_pw_multi_aff_domain(copy()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::intersect_domain(isl::set set) const +isl::pw_multi_aff pw_multi_aff::domain_map(isl::space space) { - auto res = isl_pw_qpolynomial_intersect_domain(copy(), set.release()); + auto res = isl_pw_multi_aff_domain_map(space.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::intersect_domain_wrapped_domain(isl::set set) const +isl::pw_multi_aff pw_multi_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_pw_qpolynomial_intersect_domain_wrapped_domain(copy(), set.release()); + auto res = isl_pw_multi_aff_drop_dims(copy(), static_cast(type), first, n); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::intersect_domain_wrapped_range(isl::set set) const +isl::pw_multi_aff pw_multi_aff::extract_pw_multi_aff(const isl::space &space) const { - auto res = isl_pw_qpolynomial_intersect_domain_wrapped_range(copy(), set.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).extract_pw_multi_aff(space); } -isl::pw_qpolynomial pw_qpolynomial::intersect_params(isl::set set) const +isl::multi_pw_aff pw_multi_aff::flat_range_product(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_qpolynomial_intersect_params(copy(), set.release()); - return manage(res); + return isl::multi_pw_aff(*this).flat_range_product(multi2); } -boolean pw_qpolynomial::involves_dims(isl::dim type, unsigned int first, unsigned int n) const 
+isl::multi_union_pw_aff pw_multi_aff::flat_range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_pw_qpolynomial_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return isl::multi_pw_aff(*this).flat_range_product(multi2); } -boolean pw_qpolynomial::involves_nan() const +isl::pw_multi_aff pw_multi_aff::flat_range_product(isl::pw_multi_aff pma2) const { - auto res = isl_pw_qpolynomial_involves_nan(get()); + auto res = isl_pw_multi_aff_flat_range_product(copy(), pma2.release()); return manage(res); } -boolean pw_qpolynomial::involves_param_id(const isl::id &id) const +isl::union_pw_multi_aff pw_multi_aff::flat_range_product(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_qpolynomial_involves_param_id(get(), id.get()); - return manage(res); + return isl::union_pw_multi_aff(*this).flat_range_product(upma2); } -boolean pw_qpolynomial::is_zero() const +isl::pw_multi_aff pw_multi_aff::flat_range_product(const isl::multi_aff &pma2) const { - auto res = isl_pw_qpolynomial_is_zero(get()); - return manage(res); + return this->flat_range_product(isl::pw_multi_aff(pma2)); } -boolean pw_qpolynomial::isa_qpolynomial() const +isl::pw_multi_aff pw_multi_aff::flat_range_product(const isl::pw_aff &pma2) const { - auto res = isl_pw_qpolynomial_isa_qpolynomial(get()); - return manage(res); + return this->flat_range_product(isl::pw_multi_aff(pma2)); } -isl::val pw_qpolynomial::max() const +stat pw_multi_aff::foreach_piece(const std::function &fn) const { - auto res = isl_pw_qpolynomial_max(copy()); + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_set *arg_0, isl_multi_aff *arg_1, void *arg_2) -> isl_stat { + auto *data = static_cast(arg_2); + auto ret = (data->func)(manage(arg_0), manage(arg_1)); + return ret.release(); + }; + auto res = isl_pw_multi_aff_foreach_piece(get(), fn_lambda, &fn_data); return manage(res); } -isl::val pw_qpolynomial::min() const +isl::pw_multi_aff 
pw_multi_aff::from_map(isl::map map) { - auto res = isl_pw_qpolynomial_min(copy()); + auto res = isl_pw_multi_aff_from_map(map.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::pw_multi_aff pw_multi_aff::gist(isl::set set) const { - auto res = isl_pw_qpolynomial_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); + auto res = isl_pw_multi_aff_gist(copy(), set.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::mul(isl::pw_qpolynomial pwqp2) const +isl::union_pw_multi_aff pw_multi_aff::gist(const isl::union_set &context) const { - auto res = isl_pw_qpolynomial_mul(copy(), pwqp2.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).gist(context); } -isl_size pw_qpolynomial::n_piece() const +isl::pw_multi_aff pw_multi_aff::gist(const isl::basic_set &set) const { - auto res = isl_pw_qpolynomial_n_piece(get()); - return res; + return this->gist(isl::set(set)); } -isl::pw_qpolynomial pw_qpolynomial::neg() const +isl::pw_multi_aff pw_multi_aff::gist(const isl::point &set) const { - auto res = isl_pw_qpolynomial_neg(copy()); - return manage(res); + return this->gist(isl::set(set)); } -boolean pw_qpolynomial::plain_is_equal(const isl::pw_qpolynomial &pwqp2) const +boolean pw_multi_aff::has_range_tuple_id() const { - auto res = isl_pw_qpolynomial_plain_is_equal(get(), pwqp2.get()); + auto res = isl_pw_multi_aff_has_range_tuple_id(get()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::pow(unsigned int exponent) const +isl::multi_pw_aff pw_multi_aff::identity() const { - auto res = isl_pw_qpolynomial_pow(copy(), exponent); - return manage(res); + return isl::multi_pw_aff(*this).identity(); } -isl::pw_qpolynomial pw_qpolynomial::project_domain_on_params() const +isl::pw_multi_aff pw_multi_aff::identity_on_domain(isl::space space) { - auto res = 
isl_pw_qpolynomial_project_domain_on_params(copy()); + auto res = isl_pw_multi_aff_identity_on_domain_space(space.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::reset_domain_space(isl::space space) const +isl::pw_multi_aff pw_multi_aff::insert_domain(isl::space domain) const { - auto res = isl_pw_qpolynomial_reset_domain_space(copy(), space.release()); + auto res = isl_pw_multi_aff_insert_domain(copy(), domain.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::reset_user() const +isl::pw_multi_aff pw_multi_aff::intersect_domain(isl::set set) const { - auto res = isl_pw_qpolynomial_reset_user(copy()); + auto res = isl_pw_multi_aff_intersect_domain(copy(), set.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::scale_down_val(isl::val v) const +isl::union_pw_multi_aff pw_multi_aff::intersect_domain(const isl::space &space) const { - auto res = isl_pw_qpolynomial_scale_down_val(copy(), v.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).intersect_domain(space); } -isl::pw_qpolynomial pw_qpolynomial::scale_val(isl::val v) const +isl::union_pw_multi_aff pw_multi_aff::intersect_domain(const isl::union_set &uset) const { - auto res = isl_pw_qpolynomial_scale_val(copy(), v.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).intersect_domain(uset); } -isl::pw_qpolynomial pw_qpolynomial::split_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::pw_multi_aff pw_multi_aff::intersect_domain(const isl::basic_set &set) const { - auto res = isl_pw_qpolynomial_split_dims(copy(), static_cast(type), first, n); - return manage(res); + return this->intersect_domain(isl::set(set)); } -isl::pw_qpolynomial pw_qpolynomial::split_periods(int max_periods) const +isl::pw_multi_aff pw_multi_aff::intersect_domain(const isl::point &set) const { - auto res = isl_pw_qpolynomial_split_periods(copy(), max_periods); - return manage(res); + return 
this->intersect_domain(isl::set(set)); } -isl::pw_qpolynomial pw_qpolynomial::sub(isl::pw_qpolynomial pwqp2) const +isl::union_pw_multi_aff pw_multi_aff::intersect_domain_wrapped_domain(const isl::union_set &uset) const { - auto res = isl_pw_qpolynomial_sub(copy(), pwqp2.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).intersect_domain_wrapped_domain(uset); } -isl::pw_qpolynomial pw_qpolynomial::subtract_domain(isl::set set) const +isl::union_pw_multi_aff pw_multi_aff::intersect_domain_wrapped_range(const isl::union_set &uset) const { - auto res = isl_pw_qpolynomial_subtract_domain(copy(), set.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).intersect_domain_wrapped_range(uset); } -isl::pw_qpolynomial pw_qpolynomial::to_polynomial(int sign) const +isl::pw_multi_aff pw_multi_aff::intersect_params(isl::set set) const { - auto res = isl_pw_qpolynomial_to_polynomial(copy(), sign); + auto res = isl_pw_multi_aff_intersect_params(copy(), set.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial::zero(isl::space space) +boolean pw_multi_aff::involves_locals() const { - auto res = isl_pw_qpolynomial_zero(space.release()); + auto res = isl_pw_multi_aff_involves_locals(get()); return manage(res); } -// implementations for isl::pw_qpolynomial_fold_list -pw_qpolynomial_fold_list manage(__isl_take isl_pw_qpolynomial_fold_list *ptr) { - return pw_qpolynomial_fold_list(ptr); -} -pw_qpolynomial_fold_list manage_copy(__isl_keep isl_pw_qpolynomial_fold_list *ptr) { - ptr = isl_pw_qpolynomial_fold_list_copy(ptr); - return pw_qpolynomial_fold_list(ptr); +boolean pw_multi_aff::involves_nan() const +{ + return isl::multi_pw_aff(*this).involves_nan(); } -pw_qpolynomial_fold_list::pw_qpolynomial_fold_list() - : ptr(nullptr) {} - -pw_qpolynomial_fold_list::pw_qpolynomial_fold_list(const pw_qpolynomial_fold_list &obj) - : ptr(nullptr) +boolean pw_multi_aff::involves_param(const isl::id &id) const { - ptr = obj.copy(); + return 
isl::multi_pw_aff(*this).involves_param(id); } - -pw_qpolynomial_fold_list::pw_qpolynomial_fold_list(__isl_take isl_pw_qpolynomial_fold_list *ptr) - : ptr(ptr) {} - - -pw_qpolynomial_fold_list &pw_qpolynomial_fold_list::operator=(pw_qpolynomial_fold_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; +boolean pw_multi_aff::involves_param(const std::string &id) const +{ + return this->involves_param(isl::id(ctx(), id)); } -pw_qpolynomial_fold_list::~pw_qpolynomial_fold_list() { - if (ptr) - isl_pw_qpolynomial_fold_list_free(ptr); +boolean pw_multi_aff::involves_param(const isl::id_list &list) const +{ + return isl::multi_pw_aff(*this).involves_param(list); } -__isl_give isl_pw_qpolynomial_fold_list *pw_qpolynomial_fold_list::copy() const & { - return isl_pw_qpolynomial_fold_list_copy(ptr); +boolean pw_multi_aff::isa_multi_aff() const +{ + auto res = isl_pw_multi_aff_isa_multi_aff(get()); + return manage(res); } -__isl_keep isl_pw_qpolynomial_fold_list *pw_qpolynomial_fold_list::get() const { - return ptr; +boolean pw_multi_aff::isa_pw_multi_aff() const +{ + return isl::union_pw_multi_aff(*this).isa_pw_multi_aff(); } -__isl_give isl_pw_qpolynomial_fold_list *pw_qpolynomial_fold_list::release() { - isl_pw_qpolynomial_fold_list *tmp = ptr; - ptr = nullptr; - return tmp; +isl::pw_aff_list pw_multi_aff::list() const +{ + return isl::multi_pw_aff(*this).list(); } -bool pw_qpolynomial_fold_list::is_null() const { - return ptr == nullptr; +isl::multi_pw_aff pw_multi_aff::max(const isl::multi_pw_aff &multi2) const +{ + return isl::multi_pw_aff(*this).max(multi2); } - -isl::ctx pw_qpolynomial_fold_list::ctx() const { - return isl::ctx(isl_pw_qpolynomial_fold_list_get_ctx(ptr)); +isl::multi_val pw_multi_aff::max_multi_val() const +{ + auto res = isl_pw_multi_aff_max_multi_val(copy()); + return manage(res); } -void pw_qpolynomial_fold_list::dump() const { - isl_pw_qpolynomial_fold_list_dump(get()); +isl::multi_pw_aff pw_multi_aff::min(const isl::multi_pw_aff &multi2) 
const +{ + return isl::multi_pw_aff(*this).min(multi2); } - - -// implementations for isl::pw_qpolynomial_list -pw_qpolynomial_list manage(__isl_take isl_pw_qpolynomial_list *ptr) { - return pw_qpolynomial_list(ptr); -} -pw_qpolynomial_list manage_copy(__isl_keep isl_pw_qpolynomial_list *ptr) { - ptr = isl_pw_qpolynomial_list_copy(ptr); - return pw_qpolynomial_list(ptr); +isl::multi_val pw_multi_aff::min_multi_val() const +{ + auto res = isl_pw_multi_aff_min_multi_val(copy()); + return manage(res); } -pw_qpolynomial_list::pw_qpolynomial_list() - : ptr(nullptr) {} - -pw_qpolynomial_list::pw_qpolynomial_list(const pw_qpolynomial_list &obj) - : ptr(nullptr) +isl::pw_multi_aff pw_multi_aff::multi_val_on_domain(isl::set domain, isl::multi_val mv) { - ptr = obj.copy(); + auto res = isl_pw_multi_aff_multi_val_on_domain(domain.release(), mv.release()); + return manage(res); } - -pw_qpolynomial_list::pw_qpolynomial_list(__isl_take isl_pw_qpolynomial_list *ptr) - : ptr(ptr) {} - - -pw_qpolynomial_list &pw_qpolynomial_list::operator=(pw_qpolynomial_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; +class size pw_multi_aff::n_piece() const +{ + auto res = isl_pw_multi_aff_n_piece(get()); + return manage(res); } -pw_qpolynomial_list::~pw_qpolynomial_list() { - if (ptr) - isl_pw_qpolynomial_list_free(ptr); +isl::multi_pw_aff pw_multi_aff::neg() const +{ + return isl::multi_pw_aff(*this).neg(); } -__isl_give isl_pw_qpolynomial_list *pw_qpolynomial_list::copy() const & { - return isl_pw_qpolynomial_list_copy(ptr); +boolean pw_multi_aff::plain_is_empty() const +{ + return isl::union_pw_multi_aff(*this).plain_is_empty(); } -__isl_keep isl_pw_qpolynomial_list *pw_qpolynomial_list::get() const { - return ptr; +boolean pw_multi_aff::plain_is_equal(const isl::multi_pw_aff &multi2) const +{ + return isl::multi_pw_aff(*this).plain_is_equal(multi2); } -__isl_give isl_pw_qpolynomial_list *pw_qpolynomial_list::release() { - isl_pw_qpolynomial_list *tmp = ptr; - ptr = nullptr; - 
return tmp; +boolean pw_multi_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const +{ + return isl::multi_pw_aff(*this).plain_is_equal(multi2); } -bool pw_qpolynomial_list::is_null() const { - return ptr == nullptr; +isl::pw_multi_aff pw_multi_aff::preimage_domain_wrapped_domain(isl::pw_multi_aff pma2) const +{ + auto res = isl_pw_multi_aff_preimage_domain_wrapped_domain_pw_multi_aff(copy(), pma2.release()); + return manage(res); } - -isl::ctx pw_qpolynomial_list::ctx() const { - return isl::ctx(isl_pw_qpolynomial_list_get_ctx(ptr)); +isl::union_pw_multi_aff pw_multi_aff::preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const +{ + return isl::union_pw_multi_aff(*this).preimage_domain_wrapped_domain(upma2); } -void pw_qpolynomial_list::dump() const { - isl_pw_qpolynomial_list_dump(get()); +isl::pw_multi_aff pw_multi_aff::preimage_domain_wrapped_domain(const isl::multi_aff &pma2) const +{ + return this->preimage_domain_wrapped_domain(isl::pw_multi_aff(pma2)); } - -isl::pw_qpolynomial_list pw_qpolynomial_list::add(isl::pw_qpolynomial el) const +isl::pw_multi_aff pw_multi_aff::preimage_domain_wrapped_domain(const isl::pw_aff &pma2) const { - auto res = isl_pw_qpolynomial_list_add(copy(), el.release()); - return manage(res); + return this->preimage_domain_wrapped_domain(isl::pw_multi_aff(pma2)); } -isl::pw_qpolynomial_list pw_qpolynomial_list::alloc(isl::ctx ctx, int n) +isl::multi_pw_aff pw_multi_aff::product(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_qpolynomial_list_alloc(ctx.release(), n); - return manage(res); + return isl::multi_pw_aff(*this).product(multi2); } -isl::pw_qpolynomial_list pw_qpolynomial_list::clear() const +isl::pw_multi_aff pw_multi_aff::product(isl::pw_multi_aff pma2) const { - auto res = isl_pw_qpolynomial_list_clear(copy()); + auto res = isl_pw_multi_aff_product(copy(), pma2.release()); return manage(res); } -isl::pw_qpolynomial_list pw_qpolynomial_list::concat(isl::pw_qpolynomial_list list2) 
const +isl::pw_multi_aff pw_multi_aff::product(const isl::multi_aff &pma2) const { - auto res = isl_pw_qpolynomial_list_concat(copy(), list2.release()); - return manage(res); + return this->product(isl::pw_multi_aff(pma2)); } -isl::pw_qpolynomial_list pw_qpolynomial_list::drop(unsigned int first, unsigned int n) const +isl::pw_multi_aff pw_multi_aff::product(const isl::pw_aff &pma2) const { - auto res = isl_pw_qpolynomial_list_drop(copy(), first, n); - return manage(res); + return this->product(isl::pw_multi_aff(pma2)); } -stat pw_qpolynomial_list::foreach(const std::function &fn) const +isl::pw_multi_aff pw_multi_aff::project_out_map(isl::space space, isl::dim type, unsigned int first, unsigned int n) { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_pw_qpolynomial *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_pw_qpolynomial_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_pw_multi_aff_project_out_map(space.release(), static_cast(type), first, n); return manage(res); } -isl::pw_qpolynomial_list pw_qpolynomial_list::from_pw_qpolynomial(isl::pw_qpolynomial el) +isl::multi_pw_aff pw_multi_aff::pullback(const isl::multi_pw_aff &mpa2) const { - auto res = isl_pw_qpolynomial_list_from_pw_qpolynomial(el.release()); - return manage(res); + return isl::multi_pw_aff(*this).pullback(mpa2); } -isl::pw_qpolynomial pw_qpolynomial_list::get_at(int index) const +isl::pw_multi_aff pw_multi_aff::pullback(isl::multi_aff ma) const { - auto res = isl_pw_qpolynomial_list_get_at(get(), index); + auto res = isl_pw_multi_aff_pullback_multi_aff(copy(), ma.release()); return manage(res); } -isl::pw_qpolynomial pw_qpolynomial_list::get_pw_qpolynomial(int index) const +isl::pw_multi_aff pw_multi_aff::pullback(isl::pw_multi_aff pma2) const { - auto res = isl_pw_qpolynomial_list_get_pw_qpolynomial(get(), index); + auto res = 
isl_pw_multi_aff_pullback_pw_multi_aff(copy(), pma2.release()); return manage(res); } -isl::pw_qpolynomial_list pw_qpolynomial_list::insert(unsigned int pos, isl::pw_qpolynomial el) const +isl::union_pw_multi_aff pw_multi_aff::pullback(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_pw_qpolynomial_list_insert(copy(), pos, el.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).pullback(upma2); } -isl_size pw_qpolynomial_list::n_pw_qpolynomial() const +isl::pw_multi_aff_list pw_multi_aff::pw_multi_aff_list() const { - auto res = isl_pw_qpolynomial_list_n_pw_qpolynomial(get()); - return res; + return isl::union_pw_multi_aff(*this).pw_multi_aff_list(); } -isl::pw_qpolynomial_list pw_qpolynomial_list::reverse() const +isl::pw_multi_aff pw_multi_aff::range_factor_domain() const { - auto res = isl_pw_qpolynomial_list_reverse(copy()); + auto res = isl_pw_multi_aff_range_factor_domain(copy()); return manage(res); } -isl::pw_qpolynomial_list pw_qpolynomial_list::set_pw_qpolynomial(int index, isl::pw_qpolynomial el) const +isl::pw_multi_aff pw_multi_aff::range_factor_range() const { - auto res = isl_pw_qpolynomial_list_set_pw_qpolynomial(copy(), index, el.release()); + auto res = isl_pw_multi_aff_range_factor_range(copy()); return manage(res); } -isl_size pw_qpolynomial_list::size() const +isl::pw_multi_aff pw_multi_aff::range_map(isl::space space) { - auto res = isl_pw_qpolynomial_list_size(get()); - return res; + auto res = isl_pw_multi_aff_range_map(space.release()); + return manage(res); } -isl::pw_qpolynomial_list pw_qpolynomial_list::swap(unsigned int pos1, unsigned int pos2) const +isl::multi_pw_aff pw_multi_aff::range_product(const isl::multi_pw_aff &multi2) const { - auto res = isl_pw_qpolynomial_list_swap(copy(), pos1, pos2); - return manage(res); + return isl::multi_pw_aff(*this).range_product(multi2); } -// implementations for isl::qpolynomial -qpolynomial manage(__isl_take isl_qpolynomial *ptr) { - return qpolynomial(ptr); -} 
-qpolynomial manage_copy(__isl_keep isl_qpolynomial *ptr) { - ptr = isl_qpolynomial_copy(ptr); - return qpolynomial(ptr); +isl::multi_union_pw_aff pw_multi_aff::range_product(const isl::multi_union_pw_aff &multi2) const +{ + return isl::multi_pw_aff(*this).range_product(multi2); } -qpolynomial::qpolynomial() - : ptr(nullptr) {} - -qpolynomial::qpolynomial(const qpolynomial &obj) - : ptr(nullptr) +isl::pw_multi_aff pw_multi_aff::range_product(isl::pw_multi_aff pma2) const { - ptr = obj.copy(); + auto res = isl_pw_multi_aff_range_product(copy(), pma2.release()); + return manage(res); } - -qpolynomial::qpolynomial(__isl_take isl_qpolynomial *ptr) - : ptr(ptr) {} - - -qpolynomial &qpolynomial::operator=(qpolynomial obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::union_pw_multi_aff pw_multi_aff::range_product(const isl::union_pw_multi_aff &upma2) const +{ + return isl::union_pw_multi_aff(*this).range_product(upma2); } -qpolynomial::~qpolynomial() { - if (ptr) - isl_qpolynomial_free(ptr); +isl::pw_multi_aff pw_multi_aff::range_product(const isl::multi_aff &pma2) const +{ + return this->range_product(isl::pw_multi_aff(pma2)); } -__isl_give isl_qpolynomial *qpolynomial::copy() const & { - return isl_qpolynomial_copy(ptr); +isl::pw_multi_aff pw_multi_aff::range_product(const isl::pw_aff &pma2) const +{ + return this->range_product(isl::pw_multi_aff(pma2)); } -__isl_keep isl_qpolynomial *qpolynomial::get() const { - return ptr; +isl::id pw_multi_aff::range_tuple_id() const +{ + auto res = isl_pw_multi_aff_get_range_tuple_id(get()); + return manage(res); } -__isl_give isl_qpolynomial *qpolynomial::release() { - isl_qpolynomial *tmp = ptr; - ptr = nullptr; - return tmp; +isl::id pw_multi_aff::get_range_tuple_id() const +{ + return range_tuple_id(); } -bool qpolynomial::is_null() const { - return ptr == nullptr; +isl::multi_pw_aff pw_multi_aff::reset_range_tuple_id() const +{ + return isl::multi_pw_aff(*this).reset_range_tuple_id(); } - -isl::ctx 
qpolynomial::ctx() const { - return isl::ctx(isl_qpolynomial_get_ctx(ptr)); +isl::multi_pw_aff pw_multi_aff::reset_tuple_id(isl::dim type) const +{ + return isl::multi_pw_aff(*this).reset_tuple_id(type); } -void qpolynomial::dump() const { - isl_qpolynomial_dump(get()); +isl::multi_pw_aff pw_multi_aff::scale(const isl::multi_val &mv) const +{ + return isl::multi_pw_aff(*this).scale(mv); } - -isl::qpolynomial qpolynomial::add(isl::qpolynomial qp2) const +isl::pw_multi_aff pw_multi_aff::scale(isl::val v) const { - auto res = isl_qpolynomial_add(copy(), qp2.release()); + auto res = isl_pw_multi_aff_scale_val(copy(), v.release()); return manage(res); } -isl::qpolynomial qpolynomial::add_dims(isl::dim type, unsigned int n) const +isl::pw_multi_aff pw_multi_aff::scale(long v) const { - auto res = isl_qpolynomial_add_dims(copy(), static_cast(type), n); - return manage(res); + return this->scale(isl::val(ctx(), v)); } -isl::qpolynomial qpolynomial::align_params(isl::space model) const +isl::multi_pw_aff pw_multi_aff::scale_down(const isl::multi_val &mv) const { - auto res = isl_qpolynomial_align_params(copy(), model.release()); - return manage(res); + return isl::multi_pw_aff(*this).scale_down(mv); } -stat qpolynomial::as_polynomial_on_domain(const isl::basic_set &bset, const std::function &fn) const +isl::pw_multi_aff pw_multi_aff::scale_down(isl::val v) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_basic_set *arg_0, isl_qpolynomial *arg_1, void *arg_2) -> isl_stat { - auto *data = static_cast(arg_2); - stat ret = (*data->func)(manage(arg_0), manage(arg_1)); - return ret.release(); - }; - auto res = isl_qpolynomial_as_polynomial_on_domain(get(), bset.get(), fn_lambda, &fn_data); + auto res = isl_pw_multi_aff_scale_down_val(copy(), v.release()); return manage(res); } -isl_size qpolynomial::dim(isl::dim type) const +isl::pw_multi_aff pw_multi_aff::scale_down(long v) const { - auto res = isl_qpolynomial_dim(get(), 
static_cast(type)); - return res; + return this->scale_down(isl::val(ctx(), v)); } -isl::qpolynomial qpolynomial::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::multi_pw_aff pw_multi_aff::set_at(int pos, const isl::pw_aff &el) const { - auto res = isl_qpolynomial_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::multi_pw_aff(*this).set_at(pos, el); } -isl::val qpolynomial::eval(isl::point pnt) const +isl::multi_union_pw_aff pw_multi_aff::set_at(int pos, const isl::union_pw_aff &el) const { - auto res = isl_qpolynomial_eval(copy(), pnt.release()); - return manage(res); + return isl::multi_pw_aff(*this).set_at(pos, el); } -stat qpolynomial::foreach_term(const std::function &fn) const +isl::multi_pw_aff pw_multi_aff::set_pw_aff(int pos, const isl::pw_aff &el) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_term *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_qpolynomial_foreach_term(get(), fn_lambda, &fn_data); - return manage(res); + return isl::multi_pw_aff(*this).set_pw_aff(pos, el); } -isl::qpolynomial qpolynomial::from_aff(isl::aff aff) +isl::pw_multi_aff pw_multi_aff::set_pw_aff(unsigned int pos, isl::pw_aff pa) const { - auto res = isl_qpolynomial_from_aff(aff.release()); + auto res = isl_pw_multi_aff_set_pw_aff(copy(), pos, pa.release()); return manage(res); } -isl::qpolynomial qpolynomial::from_constraint(isl::constraint c, isl::dim type, unsigned int pos) +isl::pw_multi_aff pw_multi_aff::set_range_tuple(isl::id id) const { - auto res = isl_qpolynomial_from_constraint(c.release(), static_cast(type), pos); + auto res = isl_pw_multi_aff_set_range_tuple_id(copy(), id.release()); return manage(res); } -isl::qpolynomial qpolynomial::from_term(isl::term term) +isl::pw_multi_aff pw_multi_aff::set_range_tuple(const std::string &id) const { - 
auto res = isl_qpolynomial_from_term(term.release()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl::val qpolynomial::get_constant_val() const +isl::multi_union_pw_aff pw_multi_aff::set_union_pw_aff(int pos, const isl::union_pw_aff &el) const { - auto res = isl_qpolynomial_get_constant_val(get()); - return manage(res); + return isl::multi_pw_aff(*this).set_union_pw_aff(pos, el); } -isl::space qpolynomial::get_domain_space() const +class size pw_multi_aff::size() const { - auto res = isl_qpolynomial_get_domain_space(get()); - return manage(res); + return isl::multi_pw_aff(*this).size(); } -isl::space qpolynomial::get_space() const +isl::space pw_multi_aff::space() const { - auto res = isl_qpolynomial_get_space(get()); + auto res = isl_pw_multi_aff_get_space(get()); return manage(res); } -isl::qpolynomial qpolynomial::gist(isl::set context) const +isl::space pw_multi_aff::get_space() const { - auto res = isl_qpolynomial_gist(copy(), context.release()); - return manage(res); + return space(); } -isl::qpolynomial qpolynomial::gist_params(isl::set context) const +isl::multi_pw_aff pw_multi_aff::sub(const isl::multi_pw_aff &multi2) const { - auto res = isl_qpolynomial_gist_params(copy(), context.release()); - return manage(res); + return isl::multi_pw_aff(*this).sub(multi2); } -isl::qpolynomial qpolynomial::homogenize() const +isl::multi_union_pw_aff pw_multi_aff::sub(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_qpolynomial_homogenize(copy()); - return manage(res); + return isl::multi_pw_aff(*this).sub(multi2); } -isl::qpolynomial qpolynomial::infty_on_domain(isl::space domain) +isl::pw_multi_aff pw_multi_aff::sub(isl::pw_multi_aff pma2) const { - auto res = isl_qpolynomial_infty_on_domain(domain.release()); + auto res = isl_pw_multi_aff_sub(copy(), pma2.release()); return manage(res); } -isl::qpolynomial qpolynomial::insert_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_pw_multi_aff 
pw_multi_aff::sub(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_qpolynomial_insert_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::union_pw_multi_aff(*this).sub(upma2); } -boolean qpolynomial::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::pw_multi_aff pw_multi_aff::sub(const isl::multi_aff &pma2) const { - auto res = isl_qpolynomial_involves_dims(get(), static_cast(type), first, n); - return manage(res); + return this->sub(isl::pw_multi_aff(pma2)); } -boolean qpolynomial::is_infty() const +isl::pw_multi_aff pw_multi_aff::sub(const isl::pw_aff &pma2) const { - auto res = isl_qpolynomial_is_infty(get()); - return manage(res); + return this->sub(isl::pw_multi_aff(pma2)); } -boolean qpolynomial::is_nan() const +isl::pw_multi_aff pw_multi_aff::subtract_domain(isl::set set) const { - auto res = isl_qpolynomial_is_nan(get()); + auto res = isl_pw_multi_aff_subtract_domain(copy(), set.release()); return manage(res); } -boolean qpolynomial::is_neginfty() const +isl::union_pw_multi_aff pw_multi_aff::subtract_domain(const isl::space &space) const { - auto res = isl_qpolynomial_is_neginfty(get()); - return manage(res); + return isl::union_pw_multi_aff(*this).subtract_domain(space); } -boolean qpolynomial::is_zero() const +isl::union_pw_multi_aff pw_multi_aff::subtract_domain(const isl::union_set &uset) const { - auto res = isl_qpolynomial_is_zero(get()); - return manage(res); + return isl::union_pw_multi_aff(*this).subtract_domain(uset); } -isl::qpolynomial qpolynomial::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::pw_multi_aff pw_multi_aff::subtract_domain(const isl::basic_set &set) const { - auto res = isl_qpolynomial_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); + return this->subtract_domain(isl::set(set)); } -isl::qpolynomial qpolynomial::mul(isl::qpolynomial 
qp2) const +isl::pw_multi_aff pw_multi_aff::subtract_domain(const isl::point &set) const { - auto res = isl_qpolynomial_mul(copy(), qp2.release()); - return manage(res); + return this->subtract_domain(isl::set(set)); } -isl::qpolynomial qpolynomial::nan_on_domain(isl::space domain) +isl::pw_multi_aff_list pw_multi_aff::to_list() const { - auto res = isl_qpolynomial_nan_on_domain(domain.release()); + auto res = isl_pw_multi_aff_to_list(copy()); return manage(res); } -isl::qpolynomial qpolynomial::neg() const +isl::multi_pw_aff pw_multi_aff::to_multi_pw_aff() const { - auto res = isl_qpolynomial_neg(copy()); + auto res = isl_pw_multi_aff_to_multi_pw_aff(copy()); return manage(res); } -isl::qpolynomial qpolynomial::neginfty_on_domain(isl::space domain) +isl::union_pw_multi_aff pw_multi_aff::to_union_pw_multi_aff() const { - auto res = isl_qpolynomial_neginfty_on_domain(domain.release()); + auto res = isl_pw_multi_aff_to_union_pw_multi_aff(copy()); return manage(res); } -isl::qpolynomial qpolynomial::one_on_domain(isl::space domain) +isl::id pw_multi_aff::tuple_id(isl::dim type) const { - auto res = isl_qpolynomial_one_on_domain(domain.release()); + auto res = isl_pw_multi_aff_get_tuple_id(get(), static_cast(type)); return manage(res); } -boolean qpolynomial::plain_is_equal(const isl::qpolynomial &qp2) const +isl::id pw_multi_aff::get_tuple_id(isl::dim type) const { - auto res = isl_qpolynomial_plain_is_equal(get(), qp2.get()); - return manage(res); + return tuple_id(type); } -isl::qpolynomial qpolynomial::pow(unsigned int power) const +isl::multi_pw_aff pw_multi_aff::unbind_params_insert_domain(const isl::multi_id &domain) const { - auto res = isl_qpolynomial_pow(copy(), power); - return manage(res); + return isl::multi_pw_aff(*this).unbind_params_insert_domain(domain); } -isl::qpolynomial qpolynomial::project_domain_on_params() const +isl::multi_pw_aff pw_multi_aff::union_add(const isl::multi_pw_aff &mpa2) const { - auto res = 
isl_qpolynomial_project_domain_on_params(copy()); - return manage(res); + return isl::multi_pw_aff(*this).union_add(mpa2); } -isl::qpolynomial qpolynomial::scale_down_val(isl::val v) const +isl::multi_union_pw_aff pw_multi_aff::union_add(const isl::multi_union_pw_aff &mupa2) const { - auto res = isl_qpolynomial_scale_down_val(copy(), v.release()); - return manage(res); + return isl::multi_pw_aff(*this).union_add(mupa2); } -isl::qpolynomial qpolynomial::scale_val(isl::val v) const +isl::pw_multi_aff pw_multi_aff::union_add(isl::pw_multi_aff pma2) const { - auto res = isl_qpolynomial_scale_val(copy(), v.release()); + auto res = isl_pw_multi_aff_union_add(copy(), pma2.release()); return manage(res); } -int qpolynomial::sgn() const +isl::union_pw_multi_aff pw_multi_aff::union_add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_qpolynomial_sgn(get()); - return res; + return isl::union_pw_multi_aff(*this).union_add(upma2); } -isl::qpolynomial qpolynomial::sub(isl::qpolynomial qp2) const +isl::pw_multi_aff pw_multi_aff::union_add(const isl::multi_aff &pma2) const { - auto res = isl_qpolynomial_sub(copy(), qp2.release()); - return manage(res); + return this->union_add(isl::pw_multi_aff(pma2)); } -isl::qpolynomial qpolynomial::val_on_domain(isl::space space, isl::val val) +isl::pw_multi_aff pw_multi_aff::union_add(const isl::pw_aff &pma2) const { - auto res = isl_qpolynomial_val_on_domain(space.release(), val.release()); - return manage(res); + return this->union_add(isl::pw_multi_aff(pma2)); } -isl::qpolynomial qpolynomial::var_on_domain(isl::space domain, isl::dim type, unsigned int pos) +isl::pw_multi_aff pw_multi_aff::zero(isl::space space) { - auto res = isl_qpolynomial_var_on_domain(domain.release(), static_cast(type), pos); + auto res = isl_pw_multi_aff_zero(space.release()); return manage(res); } -isl::qpolynomial qpolynomial::zero_on_domain(isl::space domain) +inline std::ostream &operator<<(std::ostream &os, const pw_multi_aff &obj) { - auto res = 
isl_qpolynomial_zero_on_domain(domain.release()); - return manage(res); + char *str = isl_pw_multi_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::qpolynomial_list -qpolynomial_list manage(__isl_take isl_qpolynomial_list *ptr) { - return qpolynomial_list(ptr); +// implementations for isl::pw_multi_aff_list +pw_multi_aff_list manage(__isl_take isl_pw_multi_aff_list *ptr) { + return pw_multi_aff_list(ptr); } -qpolynomial_list manage_copy(__isl_keep isl_qpolynomial_list *ptr) { - ptr = isl_qpolynomial_list_copy(ptr); - return qpolynomial_list(ptr); +pw_multi_aff_list manage_copy(__isl_keep isl_pw_multi_aff_list *ptr) { + ptr = isl_pw_multi_aff_list_copy(ptr); + return pw_multi_aff_list(ptr); } -qpolynomial_list::qpolynomial_list() +pw_multi_aff_list::pw_multi_aff_list() : ptr(nullptr) {} -qpolynomial_list::qpolynomial_list(const qpolynomial_list &obj) +pw_multi_aff_list::pw_multi_aff_list(const pw_multi_aff_list &obj) : ptr(nullptr) { ptr = obj.copy(); } - -qpolynomial_list::qpolynomial_list(__isl_take isl_qpolynomial_list *ptr) +pw_multi_aff_list::pw_multi_aff_list(__isl_take isl_pw_multi_aff_list *ptr) : ptr(ptr) {} +pw_multi_aff_list::pw_multi_aff_list(isl::ctx ctx, int n) +{ + auto res = isl_pw_multi_aff_list_alloc(ctx.release(), n); + ptr = res; +} + +pw_multi_aff_list::pw_multi_aff_list(isl::pw_multi_aff el) +{ + auto res = isl_pw_multi_aff_list_from_pw_multi_aff(el.release()); + ptr = res; +} + +pw_multi_aff_list::pw_multi_aff_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_pw_multi_aff_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} -qpolynomial_list &qpolynomial_list::operator=(qpolynomial_list obj) { +pw_multi_aff_list &pw_multi_aff_list::operator=(pw_multi_aff_list obj) { std::swap(this->ptr, obj.ptr); return *this; } -qpolynomial_list::~qpolynomial_list() { +pw_multi_aff_list::~pw_multi_aff_list() { if (ptr) - 
isl_qpolynomial_list_free(ptr); + isl_pw_multi_aff_list_free(ptr); } -__isl_give isl_qpolynomial_list *qpolynomial_list::copy() const & { - return isl_qpolynomial_list_copy(ptr); +__isl_give isl_pw_multi_aff_list *pw_multi_aff_list::copy() const & { + return isl_pw_multi_aff_list_copy(ptr); } -__isl_keep isl_qpolynomial_list *qpolynomial_list::get() const { +__isl_keep isl_pw_multi_aff_list *pw_multi_aff_list::get() const { return ptr; } -__isl_give isl_qpolynomial_list *qpolynomial_list::release() { - isl_qpolynomial_list *tmp = ptr; +__isl_give isl_pw_multi_aff_list *pw_multi_aff_list::release() { + isl_pw_multi_aff_list *tmp = ptr; ptr = nullptr; return tmp; } -bool qpolynomial_list::is_null() const { +bool pw_multi_aff_list::is_null() const { return ptr == nullptr; } - -isl::ctx qpolynomial_list::ctx() const { - return isl::ctx(isl_qpolynomial_list_get_ctx(ptr)); -} - -void qpolynomial_list::dump() const { - isl_qpolynomial_list_dump(get()); -} - - -isl::qpolynomial_list qpolynomial_list::add(isl::qpolynomial el) const -{ - auto res = isl_qpolynomial_list_add(copy(), el.release()); - return manage(res); -} - -isl::qpolynomial_list qpolynomial_list::alloc(isl::ctx ctx, int n) -{ - auto res = isl_qpolynomial_list_alloc(ctx.release(), n); - return manage(res); -} - -isl::qpolynomial_list qpolynomial_list::clear() const -{ - auto res = isl_qpolynomial_list_clear(copy()); - return manage(res); -} - -isl::qpolynomial_list qpolynomial_list::concat(isl::qpolynomial_list list2) const -{ - auto res = isl_qpolynomial_list_concat(copy(), list2.release()); - return manage(res); +isl::ctx pw_multi_aff_list::ctx() const { + return isl::ctx(isl_pw_multi_aff_list_get_ctx(ptr)); } -isl::qpolynomial_list qpolynomial_list::drop(unsigned int first, unsigned int n) const +isl::pw_multi_aff_list pw_multi_aff_list::add(isl::pw_multi_aff el) const { - auto res = isl_qpolynomial_list_drop(copy(), first, n); + auto res = isl_pw_multi_aff_list_add(copy(), el.release()); return 
manage(res); } -stat qpolynomial_list::foreach(const std::function &fn) const +isl::pw_multi_aff pw_multi_aff_list::at(int index) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_qpolynomial *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_qpolynomial_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_pw_multi_aff_list_get_at(get(), index); return manage(res); } -isl::qpolynomial_list qpolynomial_list::from_qpolynomial(isl::qpolynomial el) +isl::pw_multi_aff pw_multi_aff_list::get_at(int index) const { - auto res = isl_qpolynomial_list_from_qpolynomial(el.release()); - return manage(res); + return at(index); } -isl::qpolynomial qpolynomial_list::get_at(int index) const +isl::pw_multi_aff_list pw_multi_aff_list::clear() const { - auto res = isl_qpolynomial_list_get_at(get(), index); + auto res = isl_pw_multi_aff_list_clear(copy()); return manage(res); } -isl::qpolynomial qpolynomial_list::get_qpolynomial(int index) const +isl::pw_multi_aff_list pw_multi_aff_list::concat(isl::pw_multi_aff_list list2) const { - auto res = isl_qpolynomial_list_get_qpolynomial(get(), index); + auto res = isl_pw_multi_aff_list_concat(copy(), list2.release()); return manage(res); } -isl::qpolynomial_list qpolynomial_list::insert(unsigned int pos, isl::qpolynomial el) const +isl::pw_multi_aff_list pw_multi_aff_list::drop(unsigned int first, unsigned int n) const { - auto res = isl_qpolynomial_list_insert(copy(), pos, el.release()); + auto res = isl_pw_multi_aff_list_drop(copy(), first, n); return manage(res); } -isl_size qpolynomial_list::n_qpolynomial() const -{ - auto res = isl_qpolynomial_list_n_qpolynomial(get()); - return res; -} - -isl::qpolynomial_list qpolynomial_list::reverse() const +stat pw_multi_aff_list::foreach(const std::function &fn) const { - auto res = isl_qpolynomial_list_reverse(copy()); + struct 
fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_pw_multi_aff *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_pw_multi_aff_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::qpolynomial_list qpolynomial_list::set_qpolynomial(int index, isl::qpolynomial el) const +isl::pw_multi_aff_list pw_multi_aff_list::insert(unsigned int pos, isl::pw_multi_aff el) const { - auto res = isl_qpolynomial_list_set_qpolynomial(copy(), index, el.release()); + auto res = isl_pw_multi_aff_list_insert(copy(), pos, el.release()); return manage(res); } -isl_size qpolynomial_list::size() const +class size pw_multi_aff_list::size() const { - auto res = isl_qpolynomial_list_size(get()); - return res; + auto res = isl_pw_multi_aff_list_size(get()); + return manage(res); } -isl::qpolynomial_list qpolynomial_list::swap(unsigned int pos1, unsigned int pos2) const +inline std::ostream &operator<<(std::ostream &os, const pw_multi_aff_list &obj) { - auto res = isl_qpolynomial_list_swap(copy(), pos1, pos2); - return manage(res); + char *str = isl_pw_multi_aff_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::schedule @@ -14680,7 +17094,6 @@ schedule::schedule(const schedule &obj) ptr = obj.copy(); } - schedule::schedule(__isl_take isl_schedule *ptr) : ptr(ptr) {} @@ -14718,49 +17131,30 @@ bool schedule::is_null() const { return ptr == nullptr; } - isl::ctx schedule::ctx() const { return isl::ctx(isl_schedule_get_ctx(ptr)); } -void schedule::dump() const { - isl_schedule_dump(get()); -} - - isl::schedule schedule::align_params(isl::space space) const { auto res = isl_schedule_align_params(copy(), space.release()); return manage(res); } -isl::schedule schedule::empty(isl::space space) -{ - auto res = isl_schedule_empty(space.release()); 
- return manage(res); -} - -isl::schedule schedule::from_domain(isl::union_set domain) -{ - auto res = isl_schedule_from_domain(domain.release()); - return manage(res); -} - -isl::union_set schedule::get_domain() const +isl::union_set schedule::domain() const { auto res = isl_schedule_get_domain(get()); return manage(res); } -isl::union_map schedule::get_map() const +isl::union_set schedule::get_domain() const { - auto res = isl_schedule_get_map(get()); - return manage(res); + return domain(); } -isl::schedule_node schedule::get_root() const +isl::schedule schedule::from_domain(isl::union_set domain) { - auto res = isl_schedule_get_root(get()); + auto res = isl_schedule_from_domain(domain.release()); return manage(res); } @@ -14770,18 +17164,6 @@ isl::schedule schedule::gist_domain_params(isl::set context) const return manage(res); } -isl::schedule schedule::insert_context(isl::set context) const -{ - auto res = isl_schedule_insert_context(copy(), context.release()); - return manage(res); -} - -isl::schedule schedule::insert_guard(isl::set guard) const -{ - auto res = isl_schedule_insert_guard(copy(), guard.release()); - return manage(res); -} - isl::schedule schedule::insert_partial_schedule(isl::multi_union_pw_aff partial) const { auto res = isl_schedule_insert_partial_schedule(copy(), partial.release()); @@ -14794,30 +17176,52 @@ isl::schedule schedule::intersect_domain(isl::union_set domain) const return manage(res); } -boolean schedule::plain_is_equal(const isl::schedule &schedule2) const +isl::union_map schedule::map() const { - auto res = isl_schedule_plain_is_equal(get(), schedule2.get()); + auto res = isl_schedule_get_map(get()); return manage(res); } +isl::union_map schedule::get_map() const +{ + return map(); +} + isl::schedule schedule::pullback(isl::union_pw_multi_aff upma) const { auto res = isl_schedule_pullback_union_pw_multi_aff(copy(), upma.release()); return manage(res); } -isl::schedule schedule::reset_user() const +isl::schedule_node 
schedule::root() const { - auto res = isl_schedule_reset_user(copy()); + auto res = isl_schedule_get_root(get()); return manage(res); } +isl::schedule_node schedule::get_root() const +{ + return root(); +} + isl::schedule schedule::sequence(isl::schedule schedule2) const { auto res = isl_schedule_sequence(copy(), schedule2.release()); return manage(res); } +inline std::ostream &operator<<(std::ostream &os, const schedule &obj) +{ + char *str = isl_schedule_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // implementations for isl::schedule_constraints schedule_constraints manage(__isl_take isl_schedule_constraints *ptr) { return schedule_constraints(ptr); @@ -14836,7 +17240,6 @@ schedule_constraints::schedule_constraints(const schedule_constraints &obj) ptr = obj.copy(); } - schedule_constraints::schedule_constraints(__isl_take isl_schedule_constraints *ptr) : ptr(ptr) {} @@ -14874,20 +17277,19 @@ bool schedule_constraints::is_null() const { return ptr == nullptr; } - isl::ctx schedule_constraints::ctx() const { return isl::ctx(isl_schedule_constraints_get_ctx(ptr)); } -void schedule_constraints::dump() const { - isl_schedule_constraints_dump(get()); +isl::union_map schedule_constraints::coincidence() const +{ + auto res = isl_schedule_constraints_get_coincidence(get()); + return manage(res); } - -isl::schedule_constraints schedule_constraints::apply(isl::union_map umap) const +isl::union_map schedule_constraints::get_coincidence() const { - auto res = isl_schedule_constraints_apply(copy(), umap.release()); - return manage(res); + return coincidence(); } isl::schedule schedule_constraints::compute_schedule() const @@ -14896,46 +17298,48 @@ isl::schedule schedule_constraints::compute_schedule() const return manage(res); } -isl::union_map schedule_constraints::get_coincidence() const +isl::union_map schedule_constraints::conditional_validity() const { - auto res = 
isl_schedule_constraints_get_coincidence(get()); + auto res = isl_schedule_constraints_get_conditional_validity(get()); return manage(res); } isl::union_map schedule_constraints::get_conditional_validity() const { - auto res = isl_schedule_constraints_get_conditional_validity(get()); - return manage(res); + return conditional_validity(); } -isl::union_map schedule_constraints::get_conditional_validity_condition() const +isl::union_map schedule_constraints::conditional_validity_condition() const { auto res = isl_schedule_constraints_get_conditional_validity_condition(get()); return manage(res); } -isl::set schedule_constraints::get_context() const +isl::union_map schedule_constraints::get_conditional_validity_condition() const +{ + return conditional_validity_condition(); +} + +isl::set schedule_constraints::context() const { auto res = isl_schedule_constraints_get_context(get()); return manage(res); } -isl::union_set schedule_constraints::get_domain() const +isl::set schedule_constraints::get_context() const { - auto res = isl_schedule_constraints_get_domain(get()); - return manage(res); + return context(); } -isl::union_map schedule_constraints::get_proximity() const +isl::union_set schedule_constraints::domain() const { - auto res = isl_schedule_constraints_get_proximity(get()); + auto res = isl_schedule_constraints_get_domain(get()); return manage(res); } -isl::union_map schedule_constraints::get_validity() const +isl::union_set schedule_constraints::get_domain() const { - auto res = isl_schedule_constraints_get_validity(get()); - return manage(res); + return domain(); } isl::schedule_constraints schedule_constraints::on_domain(isl::union_set domain) @@ -14944,6 +17348,17 @@ isl::schedule_constraints schedule_constraints::on_domain(isl::union_set domain) return manage(res); } +isl::union_map schedule_constraints::proximity() const +{ + auto res = isl_schedule_constraints_get_proximity(get()); + return manage(res); +} + +isl::union_map 
schedule_constraints::get_proximity() const +{ + return proximity(); +} + isl::schedule_constraints schedule_constraints::set_coincidence(isl::union_map coincidence) const { auto res = isl_schedule_constraints_set_coincidence(copy(), coincidence.release()); @@ -14974,6 +17389,29 @@ isl::schedule_constraints schedule_constraints::set_validity(isl::union_map vali return manage(res); } +isl::union_map schedule_constraints::validity() const +{ + auto res = isl_schedule_constraints_get_validity(get()); + return manage(res); +} + +isl::union_map schedule_constraints::get_validity() const +{ + return validity(); +} + +inline std::ostream &operator<<(std::ostream &os, const schedule_constraints &obj) +{ + char *str = isl_schedule_constraints_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // implementations for isl::schedule_node schedule_node manage(__isl_take isl_schedule_node *ptr) { return schedule_node(ptr); @@ -14992,11 +17430,9 @@ schedule_node::schedule_node(const schedule_node &obj) ptr = obj.copy(); } - schedule_node::schedule_node(__isl_take isl_schedule_node *ptr) : ptr(ptr) {} - schedule_node &schedule_node::operator=(schedule_node obj) { std::swap(this->ptr, obj.ptr); return *this; @@ -15025,91 +17461,86 @@ bool schedule_node::is_null() const { return ptr == nullptr; } - -isl::ctx schedule_node::ctx() const { - return isl::ctx(isl_schedule_node_get_ctx(ptr)); -} - -void schedule_node::dump() const { - isl_schedule_node_dump(get()); -} - - -isl::schedule_node schedule_node::align_params(isl::space space) const +template +boolean schedule_node::isa_type(T subtype) const { - auto res = isl_schedule_node_align_params(copy(), space.release()); - return manage(res); + if (is_null()) + return boolean(); + return isl_schedule_node_get_type(get()) == subtype; } - -isl::schedule_node schedule_node::ancestor(int generation) const +template +boolean schedule_node::isa() const { - auto res 
= isl_schedule_node_ancestor(copy(), generation); - return manage(res); + return isa_type(T::type); } - -boolean schedule_node::band_member_get_coincident(int pos) const +template +T schedule_node::as() const { - auto res = isl_schedule_node_band_member_get_coincident(get(), pos); - return manage(res); + if (isa().is_false()) + isl_die(ctx().get(), isl_error_invalid, "not an object of the requested subtype", return T()); + return T(copy()); } -isl::schedule_node schedule_node::band_member_set_coincident(int pos, int coincident) const -{ - auto res = isl_schedule_node_band_member_set_coincident(copy(), pos, coincident); - return manage(res); +isl::ctx schedule_node::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::schedule_node schedule_node::band_set_ast_build_options(isl::union_set options) const +isl::schedule_node schedule_node::ancestor(int generation) const { - auto res = isl_schedule_node_band_set_ast_build_options(copy(), options.release()); + auto res = isl_schedule_node_ancestor(copy(), generation); return manage(res); } -isl::schedule_node schedule_node::child(int pos) const +class size schedule_node::ancestor_child_position(const isl::schedule_node &ancestor) const { - auto res = isl_schedule_node_child(copy(), pos); + auto res = isl_schedule_node_get_ancestor_child_position(get(), ancestor.get()); return manage(res); } -isl::set schedule_node::context_get_context() const +class size schedule_node::get_ancestor_child_position(const isl::schedule_node &ancestor) const { - auto res = isl_schedule_node_context_get_context(get()); - return manage(res); + return ancestor_child_position(ancestor); } -isl::schedule_node schedule_node::cut() const +isl::schedule_node schedule_node::child(int pos) const { - auto res = isl_schedule_node_cut(copy()); + auto res = isl_schedule_node_child(copy(), pos); return manage(res); } -isl::union_set schedule_node::domain_get_domain() const +class size schedule_node::child_position() const { - auto res = 
isl_schedule_node_domain_get_domain(get()); + auto res = isl_schedule_node_get_child_position(get()); return manage(res); } -isl::union_pw_multi_aff schedule_node::expansion_get_contraction() const +class size schedule_node::get_child_position() const { - auto res = isl_schedule_node_expansion_get_contraction(get()); - return manage(res); + return child_position(); } -isl::union_map schedule_node::expansion_get_expansion() const +isl::union_set schedule_node::domain() const { - auto res = isl_schedule_node_expansion_get_expansion(get()); + auto res = isl_schedule_node_get_domain(get()); return manage(res); } -isl::union_map schedule_node::extension_get_extension() const +isl::union_set schedule_node::get_domain() const { - auto res = isl_schedule_node_extension_get_extension(get()); - return manage(res); + return domain(); } -isl::union_set schedule_node::filter_get_filter() const +boolean schedule_node::every_descendant(const std::function &test) const { - auto res = isl_schedule_node_filter_get_filter(get()); + struct test_data { + std::function func; + } test_data = { test }; + auto test_lambda = [](isl_schedule_node *arg_0, void *arg_1) -> isl_bool { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage_copy(arg_0)); + return ret.release(); + }; + auto res = isl_schedule_node_every_descendant(get(), test_lambda, &test_data); return manage(res); } @@ -15119,125 +17550,43 @@ isl::schedule_node schedule_node::first_child() const return manage(res); } -stat schedule_node::foreach_ancestor_top_down(const std::function &fn) const +stat schedule_node::foreach_ancestor_top_down(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_schedule_node *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage_copy(arg_0)); + auto ret = (data->func)(manage_copy(arg_0)); return ret.release(); }; auto res = 
isl_schedule_node_foreach_ancestor_top_down(get(), fn_lambda, &fn_data); return manage(res); } -isl::schedule_node schedule_node::from_domain(isl::union_set domain) -{ - auto res = isl_schedule_node_from_domain(domain.release()); - return manage(res); -} - -isl::schedule_node schedule_node::from_extension(isl::union_map extension) -{ - auto res = isl_schedule_node_from_extension(extension.release()); - return manage(res); -} - -isl_size schedule_node::get_ancestor_child_position(const isl::schedule_node &ancestor) const -{ - auto res = isl_schedule_node_get_ancestor_child_position(get(), ancestor.get()); - return res; -} - -isl::schedule_node schedule_node::get_child(int pos) const -{ - auto res = isl_schedule_node_get_child(get(), pos); - return manage(res); -} - -isl_size schedule_node::get_child_position() const -{ - auto res = isl_schedule_node_get_child_position(get()); - return res; -} - -isl::union_set schedule_node::get_domain() const -{ - auto res = isl_schedule_node_get_domain(get()); - return manage(res); -} - -isl::multi_union_pw_aff schedule_node::get_prefix_schedule_multi_union_pw_aff() const -{ - auto res = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(get()); - return manage(res); -} - -isl::union_map schedule_node::get_prefix_schedule_relation() const -{ - auto res = isl_schedule_node_get_prefix_schedule_relation(get()); - return manage(res); -} - -isl::union_map schedule_node::get_prefix_schedule_union_map() const -{ - auto res = isl_schedule_node_get_prefix_schedule_union_map(get()); - return manage(res); -} - -isl::union_pw_multi_aff schedule_node::get_prefix_schedule_union_pw_multi_aff() const -{ - auto res = isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(get()); - return manage(res); -} - -isl::schedule schedule_node::get_schedule() const -{ - auto res = isl_schedule_node_get_schedule(get()); - return manage(res); -} - -isl_size schedule_node::get_schedule_depth() const -{ - auto res = 
isl_schedule_node_get_schedule_depth(get()); - return res; -} - -isl::schedule_node schedule_node::get_shared_ancestor(const isl::schedule_node &node2) const -{ - auto res = isl_schedule_node_get_shared_ancestor(get(), node2.get()); - return manage(res); -} - -isl::union_pw_multi_aff schedule_node::get_subtree_contraction() const +stat schedule_node::foreach_descendant_top_down(const std::function &fn) const { - auto res = isl_schedule_node_get_subtree_contraction(get()); - return manage(res); -} - -isl::union_map schedule_node::get_subtree_expansion() const -{ - auto res = isl_schedule_node_get_subtree_expansion(get()); + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_schedule_node *arg_0, void *arg_1) -> isl_bool { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage_copy(arg_0)); + return ret.release(); + }; + auto res = isl_schedule_node_foreach_descendant_top_down(get(), fn_lambda, &fn_data); return manage(res); } -isl::union_map schedule_node::get_subtree_schedule_union_map() const +isl::schedule_node schedule_node::from_domain(isl::union_set domain) { - auto res = isl_schedule_node_get_subtree_schedule_union_map(get()); + auto res = isl_schedule_node_from_domain(domain.release()); return manage(res); } -isl_size schedule_node::get_tree_depth() const -{ - auto res = isl_schedule_node_get_tree_depth(get()); - return res; -} - -isl::union_set schedule_node::get_universe_domain() const +isl::schedule_node schedule_node::from_extension(isl::union_map extension) { - auto res = isl_schedule_node_get_universe_domain(get()); + auto res = isl_schedule_node_from_extension(extension.release()); return manage(res); } @@ -15253,18 +17602,6 @@ isl::schedule_node schedule_node::graft_before(isl::schedule_node graft) const return manage(res); } -isl::schedule_node schedule_node::group(isl::id group_id) const -{ - auto res = isl_schedule_node_group(copy(), group_id.release()); - return manage(res); -} - -isl::set 
schedule_node::guard_get_guard() const -{ - auto res = isl_schedule_node_guard_get_guard(get()); - return manage(res); -} - boolean schedule_node::has_children() const { auto res = isl_schedule_node_has_children(get()); @@ -15313,6 +17650,11 @@ isl::schedule_node schedule_node::insert_mark(isl::id mark) const return manage(res); } +isl::schedule_node schedule_node::insert_mark(const std::string &mark) const +{ + return this->insert_mark(isl::id(ctx(), mark)); +} + isl::schedule_node schedule_node::insert_partial_schedule(isl::multi_union_pw_aff schedule) const { auto res = isl_schedule_node_insert_partial_schedule(copy(), schedule.release()); @@ -15343,16 +17685,24 @@ boolean schedule_node::is_subtree_anchored() const return manage(res); } -isl::id schedule_node::mark_get_id() const +isl::schedule_node schedule_node::map_descendant_bottom_up(const std::function &fn) const { - auto res = isl_schedule_node_mark_get_id(get()); + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_schedule_node *arg_0, void *arg_1) -> isl_schedule_node * { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_schedule_node_map_descendant_bottom_up(copy(), fn_lambda, &fn_data); return manage(res); } -isl_size schedule_node::n_children() const +class size schedule_node::n_children() const { auto res = isl_schedule_node_n_children(get()); - return res; + return manage(res); } isl::schedule_node schedule_node::next_sibling() const @@ -15379,4255 +17729,4142 @@ isl::schedule_node schedule_node::parent() const return manage(res); } -isl::schedule_node schedule_node::previous_sibling() const +isl::multi_union_pw_aff schedule_node::prefix_schedule_multi_union_pw_aff() const { - auto res = isl_schedule_node_previous_sibling(copy()); + auto res = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(get()); return manage(res); } -isl::schedule_node schedule_node::reset_user() const 
+isl::multi_union_pw_aff schedule_node::get_prefix_schedule_multi_union_pw_aff() const { - auto res = isl_schedule_node_reset_user(copy()); - return manage(res); + return prefix_schedule_multi_union_pw_aff(); } -isl::schedule_node schedule_node::root() const +isl::union_map schedule_node::prefix_schedule_relation() const { - auto res = isl_schedule_node_root(copy()); + auto res = isl_schedule_node_get_prefix_schedule_relation(get()); return manage(res); } -isl::schedule_node schedule_node::sequence_splice_child(int pos) const +isl::union_map schedule_node::get_prefix_schedule_relation() const { - auto res = isl_schedule_node_sequence_splice_child(copy(), pos); - return manage(res); -} - -// implementations for isl::set -set manage(__isl_take isl_set *ptr) { - return set(ptr); -} -set manage_copy(__isl_keep isl_set *ptr) { - ptr = isl_set_copy(ptr); - return set(ptr); + return prefix_schedule_relation(); } -set::set() - : ptr(nullptr) {} - -set::set(const set &obj) - : ptr(nullptr) +isl::union_map schedule_node::prefix_schedule_union_map() const { - ptr = obj.copy(); + auto res = isl_schedule_node_get_prefix_schedule_union_map(get()); + return manage(res); } - -set::set(__isl_take isl_set *ptr) - : ptr(ptr) {} - -set::set(isl::basic_set bset) -{ - auto res = isl_set_from_basic_set(bset.release()); - ptr = res; -} -set::set(isl::point pnt) +isl::union_map schedule_node::get_prefix_schedule_union_map() const { - auto res = isl_set_from_point(pnt.release()); - ptr = res; + return prefix_schedule_union_map(); } -set::set(isl::union_set uset) + +isl::union_pw_multi_aff schedule_node::prefix_schedule_union_pw_multi_aff() const { - auto res = isl_set_from_union_set(uset.release()); - ptr = res; + auto res = isl_schedule_node_get_prefix_schedule_union_pw_multi_aff(get()); + return manage(res); } -set::set(isl::ctx ctx, const std::string &str) + +isl::union_pw_multi_aff schedule_node::get_prefix_schedule_union_pw_multi_aff() const { - auto res = 
isl_set_read_from_str(ctx.release(), str.c_str()); - ptr = res; + return prefix_schedule_union_pw_multi_aff(); } -set &set::operator=(set obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::schedule_node schedule_node::previous_sibling() const +{ + auto res = isl_schedule_node_previous_sibling(copy()); + return manage(res); } -set::~set() { - if (ptr) - isl_set_free(ptr); +isl::schedule_node schedule_node::root() const +{ + auto res = isl_schedule_node_root(copy()); + return manage(res); } -__isl_give isl_set *set::copy() const & { - return isl_set_copy(ptr); +isl::schedule schedule_node::schedule() const +{ + auto res = isl_schedule_node_get_schedule(get()); + return manage(res); } -__isl_keep isl_set *set::get() const { - return ptr; +isl::schedule schedule_node::get_schedule() const +{ + return schedule(); } -__isl_give isl_set *set::release() { - isl_set *tmp = ptr; - ptr = nullptr; - return tmp; +class size schedule_node::schedule_depth() const +{ + auto res = isl_schedule_node_get_schedule_depth(get()); + return manage(res); } -bool set::is_null() const { - return ptr == nullptr; +class size schedule_node::get_schedule_depth() const +{ + return schedule_depth(); } - -isl::ctx set::ctx() const { - return isl::ctx(isl_set_get_ctx(ptr)); +isl::schedule_node schedule_node::shared_ancestor(const isl::schedule_node &node2) const +{ + auto res = isl_schedule_node_get_shared_ancestor(get(), node2.get()); + return manage(res); } -void set::dump() const { - isl_set_dump(get()); +isl::schedule_node schedule_node::get_shared_ancestor(const isl::schedule_node &node2) const +{ + return shared_ancestor(node2); } - -isl::set set::add_constraint(isl::constraint constraint) const +class size schedule_node::tree_depth() const { - auto res = isl_set_add_constraint(copy(), constraint.release()); + auto res = isl_schedule_node_get_tree_depth(get()); return manage(res); } -isl::set set::add_dims(isl::dim type, unsigned int n) const +class size 
schedule_node::get_tree_depth() const { - auto res = isl_set_add_dims(copy(), static_cast(type), n); - return manage(res); + return tree_depth(); } -isl::basic_set set::affine_hull() const +isl::union_set schedule_node::universe_domain() const { - auto res = isl_set_affine_hull(copy()); + auto res = isl_schedule_node_get_universe_domain(get()); return manage(res); } -isl::set set::align_params(isl::space model) const +isl::union_set schedule_node::get_universe_domain() const { - auto res = isl_set_align_params(copy(), model.release()); - return manage(res); + return universe_domain(); } -isl::set set::apply(isl::map map) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node &obj) { - auto res = isl_set_apply(copy(), map.release()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set set::bind(isl::multi_id tuple) const +// implementations for isl::schedule_node_band +schedule_node_band::schedule_node_band() + : schedule_node() {} + +schedule_node_band::schedule_node_band(const schedule_node_band &obj) + : schedule_node(obj) { - auto res = isl_set_bind(copy(), tuple.release()); - return manage(res); } -isl::basic_set set::bounded_simple_hull() const -{ - auto res = isl_set_bounded_simple_hull(copy()); - return manage(res); +schedule_node_band::schedule_node_band(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_band &schedule_node_band::operator=(schedule_node_band obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set set::box_from_points(isl::point pnt1, isl::point pnt2) -{ - auto res = isl_set_box_from_points(pnt1.release(), pnt2.release()); - return manage(res); +isl::ctx schedule_node_band::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::set set::coalesce() const +isl::union_set schedule_node_band::ast_build_options() const { - auto 
res = isl_set_coalesce(copy()); + auto res = isl_schedule_node_band_get_ast_build_options(get()); return manage(res); } -isl::basic_set set::coefficients() const +isl::union_set schedule_node_band::get_ast_build_options() const { - auto res = isl_set_coefficients(copy()); - return manage(res); + return ast_build_options(); } -isl::set set::complement() const +isl::set schedule_node_band::ast_isolate_option() const { - auto res = isl_set_complement(copy()); + auto res = isl_schedule_node_band_get_ast_isolate_option(get()); return manage(res); } -isl::basic_set set::convex_hull() const +isl::set schedule_node_band::get_ast_isolate_option() const { - auto res = isl_set_convex_hull(copy()); - return manage(res); + return ast_isolate_option(); } -isl::val set::count_val() const +boolean schedule_node_band::member_get_coincident(int pos) const { - auto res = isl_set_count_val(get()); + auto res = isl_schedule_node_band_member_get_coincident(get(), pos); return manage(res); } -isl::set set::detect_equalities() const +schedule_node_band schedule_node_band::member_set_coincident(int pos, int coincident) const { - auto res = isl_set_detect_equalities(copy()); - return manage(res); + auto res = isl_schedule_node_band_member_set_coincident(copy(), pos, coincident); + return manage(res).as(); } -isl_size set::dim(isl::dim type) const +schedule_node_band schedule_node_band::mod(isl::multi_val mv) const { - auto res = isl_set_dim(get(), static_cast(type)); - return res; + auto res = isl_schedule_node_band_mod(copy(), mv.release()); + return manage(res).as(); } -boolean set::dim_has_any_lower_bound(isl::dim type, unsigned int pos) const +class size schedule_node_band::n_member() const { - auto res = isl_set_dim_has_any_lower_bound(get(), static_cast(type), pos); + auto res = isl_schedule_node_band_n_member(get()); return manage(res); } -boolean set::dim_has_any_upper_bound(isl::dim type, unsigned int pos) const +isl::multi_union_pw_aff schedule_node_band::partial_schedule() const 
{ - auto res = isl_set_dim_has_any_upper_bound(get(), static_cast(type), pos); + auto res = isl_schedule_node_band_get_partial_schedule(get()); return manage(res); } -boolean set::dim_has_lower_bound(isl::dim type, unsigned int pos) const +isl::multi_union_pw_aff schedule_node_band::get_partial_schedule() const { - auto res = isl_set_dim_has_lower_bound(get(), static_cast(type), pos); - return manage(res); + return partial_schedule(); } -boolean set::dim_has_upper_bound(isl::dim type, unsigned int pos) const +boolean schedule_node_band::permutable() const { - auto res = isl_set_dim_has_upper_bound(get(), static_cast(type), pos); + auto res = isl_schedule_node_band_get_permutable(get()); return manage(res); } -boolean set::dim_is_bounded(isl::dim type, unsigned int pos) const +boolean schedule_node_band::get_permutable() const { - auto res = isl_set_dim_is_bounded(get(), static_cast(type), pos); - return manage(res); + return permutable(); } -isl::pw_aff set::dim_max(int pos) const +schedule_node_band schedule_node_band::scale(isl::multi_val mv) const { - auto res = isl_set_dim_max(copy(), pos); - return manage(res); + auto res = isl_schedule_node_band_scale(copy(), mv.release()); + return manage(res).as(); } -isl::val set::dim_max_val(int pos) const +schedule_node_band schedule_node_band::scale_down(isl::multi_val mv) const { - auto res = isl_set_dim_max_val(copy(), pos); - return manage(res); + auto res = isl_schedule_node_band_scale_down(copy(), mv.release()); + return manage(res).as(); } -isl::pw_aff set::dim_min(int pos) const +schedule_node_band schedule_node_band::set_ast_build_options(isl::union_set options) const { - auto res = isl_set_dim_min(copy(), pos); - return manage(res); + auto res = isl_schedule_node_band_set_ast_build_options(copy(), options.release()); + return manage(res).as(); } -isl::val set::dim_min_val(int pos) const +schedule_node_band schedule_node_band::set_permutable(int permutable) const { - auto res = isl_set_dim_min_val(copy(), pos); 
- return manage(res); + auto res = isl_schedule_node_band_set_permutable(copy(), permutable); + return manage(res).as(); } -isl::set set::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +schedule_node_band schedule_node_band::shift(isl::multi_union_pw_aff shift) const { - auto res = isl_set_drop_constraints_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + auto res = isl_schedule_node_band_shift(copy(), shift.release()); + return manage(res).as(); } -isl::set set::drop_constraints_not_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +schedule_node_band schedule_node_band::split(int pos) const { - auto res = isl_set_drop_constraints_not_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + auto res = isl_schedule_node_band_split(copy(), pos); + return manage(res).as(); } -isl::set set::drop_unused_params() const +schedule_node_band schedule_node_band::tile(isl::multi_val sizes) const { - auto res = isl_set_drop_unused_params(copy()); - return manage(res); + auto res = isl_schedule_node_band_tile(copy(), sizes.release()); + return manage(res).as(); } -isl::set set::eliminate(isl::dim type, unsigned int first, unsigned int n) const +schedule_node_band schedule_node_band::member_set_ast_loop_default(int pos) const { - auto res = isl_set_eliminate(copy(), static_cast(type), first, n); - return manage(res); + auto res = isl_schedule_node_band_member_set_ast_loop_type(copy(), pos, isl_ast_loop_default); + return manage(res).as(); } -isl::set set::empty(isl::space space) +schedule_node_band schedule_node_band::member_set_ast_loop_atomic(int pos) const { - auto res = isl_set_empty(space.release()); - return manage(res); + auto res = isl_schedule_node_band_member_set_ast_loop_type(copy(), pos, isl_ast_loop_atomic); + return manage(res).as(); } -isl::set set::equate(isl::dim type1, int pos1, isl::dim type2, int pos2) const +schedule_node_band 
schedule_node_band::member_set_ast_loop_unroll(int pos) const { - auto res = isl_set_equate(copy(), static_cast(type1), pos1, static_cast(type2), pos2); - return manage(res); + auto res = isl_schedule_node_band_member_set_ast_loop_type(copy(), pos, isl_ast_loop_unroll); + return manage(res).as(); } -int set::find_dim_by_id(isl::dim type, const isl::id &id) const +schedule_node_band schedule_node_band::member_set_ast_loop_separate(int pos) const { - auto res = isl_set_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + auto res = isl_schedule_node_band_member_set_ast_loop_type(copy(), pos, isl_ast_loop_separate); + return manage(res).as(); } -int set::find_dim_by_name(isl::dim type, const std::string &name) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_band &obj) { - auto res = isl_set_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set set::fix_si(isl::dim type, unsigned int pos, int value) const +// implementations for isl::schedule_node_context +schedule_node_context::schedule_node_context() + : schedule_node() {} + +schedule_node_context::schedule_node_context(const schedule_node_context &obj) + : schedule_node(obj) { - auto res = isl_set_fix_si(copy(), static_cast(type), pos, value); - return manage(res); } -isl::set set::fix_val(isl::dim type, unsigned int pos, isl::val v) const -{ - auto res = isl_set_fix_val(copy(), static_cast(type), pos, v.release()); - return manage(res); +schedule_node_context::schedule_node_context(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_context &schedule_node_context::operator=(schedule_node_context obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set set::flat_product(isl::set set2) const -{ - auto res = isl_set_flat_product(copy(), set2.release()); 
- return manage(res); +isl::ctx schedule_node_context::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::set set::flatten() const +isl::set schedule_node_context::context() const { - auto res = isl_set_flatten(copy()); + auto res = isl_schedule_node_context_get_context(get()); return manage(res); } -isl::map set::flatten_map() const +isl::set schedule_node_context::get_context() const { - auto res = isl_set_flatten_map(copy()); - return manage(res); + return context(); } -int set::follows_at(const isl::set &set2, int pos) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_context &obj) { - auto res = isl_set_follows_at(get(), set2.get(), pos); - return res; + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -stat set::foreach_basic_set(const std::function &fn) const +// implementations for isl::schedule_node_domain +schedule_node_domain::schedule_node_domain() + : schedule_node() {} + +schedule_node_domain::schedule_node_domain(const schedule_node_domain &obj) + : schedule_node(obj) { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_basic_set *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_set_foreach_basic_set(get(), fn_lambda, &fn_data); - return manage(res); } -stat set::foreach_point(const std::function &fn) const -{ - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_point *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_set_foreach_point(get(), fn_lambda, &fn_data); - return manage(res); +schedule_node_domain::schedule_node_domain(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + 
+schedule_node_domain &schedule_node_domain::operator=(schedule_node_domain obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set set::from_multi_aff(isl::multi_aff ma) -{ - auto res = isl_set_from_multi_aff(ma.release()); - return manage(res); +isl::ctx schedule_node_domain::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::set set::from_multi_pw_aff(isl::multi_pw_aff mpa) +isl::union_set schedule_node_domain::domain() const { - auto res = isl_set_from_multi_pw_aff(mpa.release()); + auto res = isl_schedule_node_domain_get_domain(get()); return manage(res); } -isl::set set::from_params() const +isl::union_set schedule_node_domain::get_domain() const { - auto res = isl_set_from_params(copy()); - return manage(res); + return domain(); } -isl::set set::from_pw_aff(isl::pw_aff pwaff) +inline std::ostream &operator<<(std::ostream &os, const schedule_node_domain &obj) { - auto res = isl_set_from_pw_aff(pwaff.release()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set set::from_pw_multi_aff(isl::pw_multi_aff pma) +// implementations for isl::schedule_node_expansion +schedule_node_expansion::schedule_node_expansion() + : schedule_node() {} + +schedule_node_expansion::schedule_node_expansion(const schedule_node_expansion &obj) + : schedule_node(obj) { - auto res = isl_set_from_pw_multi_aff(pma.release()); - return manage(res); } -isl::basic_set_list set::get_basic_set_list() const -{ - auto res = isl_set_get_basic_set_list(get()); - return manage(res); +schedule_node_expansion::schedule_node_expansion(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_expansion &schedule_node_expansion::operator=(schedule_node_expansion obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + +isl::ctx schedule_node_expansion::ctx() const { + return 
isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::id set::get_dim_id(isl::dim type, unsigned int pos) const +isl::union_pw_multi_aff schedule_node_expansion::contraction() const { - auto res = isl_set_get_dim_id(get(), static_cast(type), pos); + auto res = isl_schedule_node_expansion_get_contraction(get()); return manage(res); } -std::string set::get_dim_name(isl::dim type, unsigned int pos) const +isl::union_pw_multi_aff schedule_node_expansion::get_contraction() const { - auto res = isl_set_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + return contraction(); } -isl::multi_val set::get_plain_multi_val_if_fixed() const +isl::union_map schedule_node_expansion::expansion() const { - auto res = isl_set_get_plain_multi_val_if_fixed(get()); + auto res = isl_schedule_node_expansion_get_expansion(get()); return manage(res); } -isl::fixed_box set::get_simple_fixed_box_hull() const +isl::union_map schedule_node_expansion::get_expansion() const { - auto res = isl_set_get_simple_fixed_box_hull(get()); - return manage(res); + return expansion(); } -isl::space set::get_space() const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_expansion &obj) { - auto res = isl_set_get_space(get()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::val set::get_stride(int pos) const -{ - auto res = isl_set_get_stride(get(), pos); - return manage(res); -} +// implementations for isl::schedule_node_extension +schedule_node_extension::schedule_node_extension() + : schedule_node() {} -isl::id set::get_tuple_id() const +schedule_node_extension::schedule_node_extension(const schedule_node_extension &obj) + : schedule_node(obj) { - auto res = isl_set_get_tuple_id(get()); - return manage(res); } -std::string set::get_tuple_name() const -{ - auto res = isl_set_get_tuple_name(get()); - std::string 
tmp(res); - return tmp; +schedule_node_extension::schedule_node_extension(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_extension &schedule_node_extension::operator=(schedule_node_extension obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set set::gist(isl::set context) const -{ - auto res = isl_set_gist(copy(), context.release()); - return manage(res); +isl::ctx schedule_node_extension::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::set set::gist_basic_set(isl::basic_set context) const +isl::union_map schedule_node_extension::extension() const { - auto res = isl_set_gist_basic_set(copy(), context.release()); + auto res = isl_schedule_node_extension_get_extension(get()); return manage(res); } -isl::set set::gist_params(isl::set context) const +isl::union_map schedule_node_extension::get_extension() const { - auto res = isl_set_gist_params(copy(), context.release()); - return manage(res); + return extension(); } -boolean set::has_dim_id(isl::dim type, unsigned int pos) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_extension &obj) { - auto res = isl_set_has_dim_id(get(), static_cast(type), pos); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean set::has_dim_name(isl::dim type, unsigned int pos) const +// implementations for isl::schedule_node_filter +schedule_node_filter::schedule_node_filter() + : schedule_node() {} + +schedule_node_filter::schedule_node_filter(const schedule_node_filter &obj) + : schedule_node(obj) { - auto res = isl_set_has_dim_name(get(), static_cast(type), pos); - return manage(res); } -boolean set::has_equal_space(const isl::set &set2) const -{ - auto res = isl_set_has_equal_space(get(), set2.get()); - return manage(res); +schedule_node_filter::schedule_node_filter(__isl_take isl_schedule_node *ptr) + : 
schedule_node(ptr) {} + +schedule_node_filter &schedule_node_filter::operator=(schedule_node_filter obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -boolean set::has_tuple_id() const -{ - auto res = isl_set_has_tuple_id(get()); - return manage(res); +isl::ctx schedule_node_filter::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -boolean set::has_tuple_name() const +isl::union_set schedule_node_filter::filter() const { - auto res = isl_set_has_tuple_name(get()); + auto res = isl_schedule_node_filter_get_filter(get()); return manage(res); } -isl::map set::identity() const +isl::union_set schedule_node_filter::get_filter() const { - auto res = isl_set_identity(copy()); - return manage(res); + return filter(); } -isl::pw_aff set::indicator_function() const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_filter &obj) { - auto res = isl_set_indicator_function(copy()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::set set::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const +// implementations for isl::schedule_node_guard +schedule_node_guard::schedule_node_guard() + : schedule_node() {} + +schedule_node_guard::schedule_node_guard(const schedule_node_guard &obj) + : schedule_node(obj) { - auto res = isl_set_insert_dims(copy(), static_cast(type), pos, n); - return manage(res); } -isl::map set::insert_domain(isl::space domain) const -{ - auto res = isl_set_insert_domain(copy(), domain.release()); - return manage(res); +schedule_node_guard::schedule_node_guard(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_guard &schedule_node_guard::operator=(schedule_node_guard obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::set set::intersect(isl::set set2) const -{ - auto res = isl_set_intersect(copy(), set2.release()); - return 
manage(res); +isl::ctx schedule_node_guard::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::set set::intersect_factor_domain(isl::set domain) const +isl::set schedule_node_guard::guard() const { - auto res = isl_set_intersect_factor_domain(copy(), domain.release()); + auto res = isl_schedule_node_guard_get_guard(get()); return manage(res); } -isl::set set::intersect_factor_range(isl::set range) const +isl::set schedule_node_guard::get_guard() const { - auto res = isl_set_intersect_factor_range(copy(), range.release()); - return manage(res); + return guard(); } -isl::set set::intersect_params(isl::set params) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_guard &obj) { - auto res = isl_set_intersect_params(copy(), params.release()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean set::involves_dims(isl::dim type, unsigned int first, unsigned int n) const +// implementations for isl::schedule_node_leaf +schedule_node_leaf::schedule_node_leaf() + : schedule_node() {} + +schedule_node_leaf::schedule_node_leaf(const schedule_node_leaf &obj) + : schedule_node(obj) { - auto res = isl_set_involves_dims(get(), static_cast(type), first, n); - return manage(res); } -boolean set::involves_locals() const -{ - auto res = isl_set_involves_locals(get()); - return manage(res); +schedule_node_leaf::schedule_node_leaf(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_leaf &schedule_node_leaf::operator=(schedule_node_leaf obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -boolean set::is_bounded() const -{ - auto res = isl_set_is_bounded(get()); - return manage(res); +isl::ctx schedule_node_leaf::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -boolean set::is_box() const +inline std::ostream &operator<<(std::ostream &os, const 
schedule_node_leaf &obj) { - auto res = isl_set_is_box(get()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean set::is_disjoint(const isl::set &set2) const +// implementations for isl::schedule_node_mark +schedule_node_mark::schedule_node_mark() + : schedule_node() {} + +schedule_node_mark::schedule_node_mark(const schedule_node_mark &obj) + : schedule_node(obj) { - auto res = isl_set_is_disjoint(get(), set2.get()); - return manage(res); } -boolean set::is_empty() const -{ - auto res = isl_set_is_empty(get()); - return manage(res); +schedule_node_mark::schedule_node_mark(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_mark &schedule_node_mark::operator=(schedule_node_mark obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -boolean set::is_equal(const isl::set &set2) const -{ - auto res = isl_set_is_equal(get(), set2.get()); - return manage(res); +isl::ctx schedule_node_mark::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -boolean set::is_params() const +isl::id schedule_node_mark::id() const { - auto res = isl_set_is_params(get()); + auto res = isl_schedule_node_mark_get_id(get()); return manage(res); } -boolean set::is_singleton() const +isl::id schedule_node_mark::get_id() const { - auto res = isl_set_is_singleton(get()); - return manage(res); + return id(); } -boolean set::is_strict_subset(const isl::set &set2) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_mark &obj) { - auto res = isl_set_is_strict_subset(get(), set2.get()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean set::is_subset(const isl::set &set2) const +// implementations for isl::schedule_node_sequence 
+schedule_node_sequence::schedule_node_sequence() + : schedule_node() {} + +schedule_node_sequence::schedule_node_sequence(const schedule_node_sequence &obj) + : schedule_node(obj) { - auto res = isl_set_is_subset(get(), set2.get()); - return manage(res); } -boolean set::is_wrapping() const -{ - auto res = isl_set_is_wrapping(get()); - return manage(res); +schedule_node_sequence::schedule_node_sequence(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_sequence &schedule_node_sequence::operator=(schedule_node_sequence obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::map set::lex_ge_set(isl::set set2) const -{ - auto res = isl_set_lex_ge_set(copy(), set2.release()); - return manage(res); +isl::ctx schedule_node_sequence::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::map set::lex_gt_set(isl::set set2) const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_sequence &obj) { - auto res = isl_set_lex_gt_set(copy(), set2.release()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::map set::lex_lt_set(isl::set set2) const +// implementations for isl::schedule_node_set +schedule_node_set::schedule_node_set() + : schedule_node() {} + +schedule_node_set::schedule_node_set(const schedule_node_set &obj) + : schedule_node(obj) { - auto res = isl_set_lex_lt_set(copy(), set2.release()); - return manage(res); } -isl::set set::lexmax() const -{ - auto res = isl_set_lexmax(copy()); - return manage(res); +schedule_node_set::schedule_node_set(__isl_take isl_schedule_node *ptr) + : schedule_node(ptr) {} + +schedule_node_set &schedule_node_set::operator=(schedule_node_set obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::pw_multi_aff set::lexmax_pw_multi_aff() const -{ - auto res = isl_set_lexmax_pw_multi_aff(copy()); - return manage(res); 
+isl::ctx schedule_node_set::ctx() const { + return isl::ctx(isl_schedule_node_get_ctx(ptr)); } -isl::set set::lexmin() const +inline std::ostream &operator<<(std::ostream &os, const schedule_node_set &obj) { - auto res = isl_set_lexmin(copy()); - return manage(res); + char *str = isl_schedule_node_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::pw_multi_aff set::lexmin_pw_multi_aff() const -{ - auto res = isl_set_lexmin_pw_multi_aff(copy()); - return manage(res); +// implementations for isl::set +set manage(__isl_take isl_set *ptr) { + return set(ptr); +} +set manage_copy(__isl_keep isl_set *ptr) { + ptr = isl_set_copy(ptr); + return set(ptr); } -isl::set set::lower_bound(isl::multi_pw_aff lower) const +set::set() + : ptr(nullptr) {} + +set::set(const set &obj) + : ptr(nullptr) { - auto res = isl_set_lower_bound_multi_pw_aff(copy(), lower.release()); - return manage(res); + ptr = obj.copy(); } -isl::set set::lower_bound(isl::multi_val lower) const +set::set(__isl_take isl_set *ptr) + : ptr(ptr) {} + +set::set(isl::basic_set bset) { - auto res = isl_set_lower_bound_multi_val(copy(), lower.release()); - return manage(res); + auto res = isl_set_from_basic_set(bset.release()); + ptr = res; } -isl::set set::lower_bound_si(isl::dim type, unsigned int pos, int value) const +set::set(isl::point pnt) { - auto res = isl_set_lower_bound_si(copy(), static_cast(type), pos, value); - return manage(res); + auto res = isl_set_from_point(pnt.release()); + ptr = res; } -isl::set set::lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const +set::set(isl::union_set uset) { - auto res = isl_set_lower_bound_val(copy(), static_cast(type), pos, value.release()); - return manage(res); + auto res = isl_set_from_union_set(uset.release()); + ptr = res; } -isl::multi_pw_aff set::max_multi_pw_aff() const +set::set(isl::ctx ctx, const std::string &str) { - auto res = 
isl_set_max_multi_pw_aff(copy()); - return manage(res); + auto res = isl_set_read_from_str(ctx.release(), str.c_str()); + ptr = res; } -isl::val set::max_val(const isl::aff &obj) const -{ - auto res = isl_set_max_val(get(), obj.get()); - return manage(res); +set &set::operator=(set obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::multi_pw_aff set::min_multi_pw_aff() const -{ - auto res = isl_set_min_multi_pw_aff(copy()); - return manage(res); +set::~set() { + if (ptr) + isl_set_free(ptr); } -isl::val set::min_val(const isl::aff &obj) const -{ - auto res = isl_set_min_val(get(), obj.get()); - return manage(res); +__isl_give isl_set *set::copy() const & { + return isl_set_copy(ptr); } -isl::set set::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const -{ - auto res = isl_set_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); +__isl_keep isl_set *set::get() const { + return ptr; } -isl_size set::n_basic_set() const -{ - auto res = isl_set_n_basic_set(get()); - return res; +__isl_give isl_set *set::release() { + isl_set *tmp = ptr; + ptr = nullptr; + return tmp; } -isl_size set::n_dim() const -{ - auto res = isl_set_n_dim(get()); - return res; +bool set::is_null() const { + return ptr == nullptr; } -isl::set set::nat_universe(isl::space space) -{ - auto res = isl_set_nat_universe(space.release()); - return manage(res); +isl::ctx set::ctx() const { + return isl::ctx(isl_set_get_ctx(ptr)); } -isl::set set::neg() const +isl::set set::add_constraint(isl::constraint constraint) const { - auto res = isl_set_neg(copy()); + auto res = isl_set_add_constraint(copy(), constraint.release()); return manage(res); } -isl::set set::params() const +isl::set set::add_dims(isl::dim type, unsigned int n) const { - auto res = isl_set_params(copy()); + auto res = isl_set_add_dims(copy(), static_cast(type), n); return manage(res); } -int set::plain_cmp(const 
isl::set &set2) const +isl::basic_set set::affine_hull() const { - auto res = isl_set_plain_cmp(get(), set2.get()); - return res; + auto res = isl_set_affine_hull(copy()); + return manage(res); } -isl::val set::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const +isl::set set::align_params(isl::space model) const { - auto res = isl_set_plain_get_val_if_fixed(get(), static_cast(type), pos); + auto res = isl_set_align_params(copy(), model.release()); return manage(res); } -boolean set::plain_is_disjoint(const isl::set &set2) const +isl::set set::apply(isl::map map) const { - auto res = isl_set_plain_is_disjoint(get(), set2.get()); + auto res = isl_set_apply(copy(), map.release()); return manage(res); } -boolean set::plain_is_empty() const +isl::union_set set::apply(const isl::union_map &umap) const { - auto res = isl_set_plain_is_empty(get()); - return manage(res); + return isl::union_set(*this).apply(umap); } -boolean set::plain_is_equal(const isl::set &set2) const +isl::set set::apply(const isl::basic_map &map) const { - auto res = isl_set_plain_is_equal(get(), set2.get()); - return manage(res); + return this->apply(isl::map(map)); } -boolean set::plain_is_universe() const +isl::pw_multi_aff set::as_pw_multi_aff() const { - auto res = isl_set_plain_is_universe(get()); + auto res = isl_set_as_pw_multi_aff(copy()); return manage(res); } -isl::basic_set set::plain_unshifted_simple_hull() const +isl::set set::as_set() const { - auto res = isl_set_plain_unshifted_simple_hull(copy()); - return manage(res); + return isl::union_set(*this).as_set(); } -isl::basic_set set::polyhedral_hull() const +isl::basic_set_list set::basic_set_list() const { - auto res = isl_set_polyhedral_hull(copy()); + auto res = isl_set_get_basic_set_list(get()); return manage(res); } -isl::set set::preimage(isl::multi_aff ma) const +isl::basic_set_list set::get_basic_set_list() const { - auto res = isl_set_preimage_multi_aff(copy(), ma.release()); - return manage(res); + return 
basic_set_list(); } -isl::set set::preimage(isl::multi_pw_aff mpa) const +isl::set set::bind(isl::multi_id tuple) const { - auto res = isl_set_preimage_multi_pw_aff(copy(), mpa.release()); + auto res = isl_set_bind(copy(), tuple.release()); return manage(res); } -isl::set set::preimage(isl::pw_multi_aff pma) const +isl::set set::coalesce() const { - auto res = isl_set_preimage_pw_multi_aff(copy(), pma.release()); + auto res = isl_set_coalesce(copy()); return manage(res); } -isl::set set::product(isl::set set2) const +isl::set set::complement() const { - auto res = isl_set_product(copy(), set2.release()); + auto res = isl_set_complement(copy()); return manage(res); } -isl::map set::project_onto_map(isl::dim type, unsigned int first, unsigned int n) const +isl::union_set set::compute_divs() const { - auto res = isl_set_project_onto_map(copy(), static_cast(type), first, n); - return manage(res); + return isl::union_set(*this).compute_divs(); } -isl::set set::project_out(isl::dim type, unsigned int first, unsigned int n) const +boolean set::contains(const isl::space &space) const { - auto res = isl_set_project_out(copy(), static_cast(type), first, n); - return manage(res); + return isl::union_set(*this).contains(space); } -isl::set set::project_out_all_params() const +isl::basic_set set::convex_hull() const { - auto res = isl_set_project_out_all_params(copy()); + auto res = isl_set_convex_hull(copy()); return manage(res); } -isl::set set::project_out_param(isl::id id) const +isl::set set::detect_equalities() const { - auto res = isl_set_project_out_param_id(copy(), id.release()); + auto res = isl_set_detect_equalities(copy()); return manage(res); } -isl::set set::project_out_param(isl::id_list list) const +class size set::dim(isl::dim type) const { - auto res = isl_set_project_out_param_id_list(copy(), list.release()); + auto res = isl_set_dim(get(), static_cast(type)); return manage(res); } -isl::set set::remove_dims(isl::dim type, unsigned int first, unsigned int n) 
const +boolean set::dim_has_any_lower_bound(isl::dim type, unsigned int pos) const { - auto res = isl_set_remove_dims(copy(), static_cast(type), first, n); + auto res = isl_set_dim_has_any_lower_bound(get(), static_cast(type), pos); return manage(res); } -isl::set set::remove_divs() const +isl::id set::dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_set_remove_divs(copy()); + auto res = isl_set_get_dim_id(get(), static_cast(type), pos); return manage(res); } -isl::set set::remove_divs_involving_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::id set::get_dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_set_remove_divs_involving_dims(copy(), static_cast(type), first, n); - return manage(res); + return dim_id(type, pos); } -isl::set set::remove_redundancies() const +isl::pw_aff set::dim_max(int pos) const { - auto res = isl_set_remove_redundancies(copy()); + auto res = isl_set_dim_max(copy(), pos); return manage(res); } -isl::set set::remove_unknown_divs() const +isl::val set::dim_max_val(int pos) const { - auto res = isl_set_remove_unknown_divs(copy()); + auto res = isl_set_dim_max_val(copy(), pos); return manage(res); } -isl::set set::reset_space(isl::space space) const +isl::pw_aff set::dim_min(int pos) const { - auto res = isl_set_reset_space(copy(), space.release()); + auto res = isl_set_dim_min(copy(), pos); return manage(res); } -isl::set set::reset_tuple_id() const +isl::val set::dim_min_val(int pos) const { - auto res = isl_set_reset_tuple_id(copy()); + auto res = isl_set_dim_min_val(copy(), pos); return manage(res); } -isl::set set::reset_user() const +std::string set::dim_name(isl::dim type, unsigned int pos) const { - auto res = isl_set_reset_user(copy()); - return manage(res); + auto res = isl_set_get_dim_name(get(), static_cast(type), pos); + std::string tmp(res); + return tmp; } -isl::basic_set set::sample() const +std::string set::get_dim_name(isl::dim type, unsigned int pos) const { - auto res = 
isl_set_sample(copy()); - return manage(res); + return dim_name(type, pos); } -isl::point set::sample_point() const +isl::set set::drop_constraints_involving_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_set_sample_point(copy()); + auto res = isl_set_drop_constraints_involving_dims(copy(), static_cast(type), first, n); return manage(res); } -isl::set set::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::set set::eliminate(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_set_set_dim_id(copy(), static_cast(type), pos, id.release()); + auto res = isl_set_eliminate(copy(), static_cast(type), first, n); return manage(res); } -isl::set set::set_tuple_id(isl::id id) const +isl::set set::empty(isl::space space) { - auto res = isl_set_set_tuple_id(copy(), id.release()); + auto res = isl_set_empty(space.release()); return manage(res); } -isl::set set::set_tuple_name(const std::string &s) const +boolean set::every_set(const std::function &test) const { - auto res = isl_set_set_tuple_name(copy(), s.c_str()); - return manage(res); + return isl::union_set(*this).every_set(test); } -isl::basic_set set::simple_hull() const +isl::set set::extract_set(const isl::space &space) const { - auto res = isl_set_simple_hull(copy()); - return manage(res); + return isl::union_set(*this).extract_set(space); } -int set::size() const +int set::find_dim_by_id(isl::dim type, const isl::id &id) const { - auto res = isl_set_size(get()); + auto res = isl_set_find_dim_by_id(get(), static_cast(type), id.get()); return res; } -isl::basic_set set::solutions() const +int set::find_dim_by_id(isl::dim type, const std::string &id) const { - auto res = isl_set_solutions(copy()); - return manage(res); + return this->find_dim_by_id(type, isl::id(ctx(), id)); } -isl::set set::split_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::set set::fix_si(isl::dim type, unsigned int pos, int value) const { - auto res = 
isl_set_split_dims(copy(), static_cast(type), first, n); + auto res = isl_set_fix_si(copy(), static_cast(type), pos, value); return manage(res); } -isl::set set::subtract(isl::set set2) const +isl::set set::flatten() const { - auto res = isl_set_subtract(copy(), set2.release()); + auto res = isl_set_flatten(copy()); return manage(res); } -isl::set set::sum(isl::set set2) const +stat set::foreach_basic_set(const std::function &fn) const { - auto res = isl_set_sum(copy(), set2.release()); + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_basic_set *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_set_foreach_basic_set(get(), fn_lambda, &fn_data); return manage(res); } -isl::map set::translation() const +stat set::foreach_point(const std::function &fn) const { - auto res = isl_set_translation(copy()); + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_point *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_set_foreach_point(get(), fn_lambda, &fn_data); return manage(res); } -isl_size set::tuple_dim() const +stat set::foreach_set(const std::function &fn) const { - auto res = isl_set_tuple_dim(get()); - return res; + return isl::union_set(*this).foreach_set(fn); } -isl::set set::unbind_params(isl::multi_id tuple) const +isl::set set::gist(isl::set context) const { - auto res = isl_set_unbind_params(copy(), tuple.release()); + auto res = isl_set_gist(copy(), context.release()); return manage(res); } -isl::map set::unbind_params_insert_domain(isl::multi_id domain) const +isl::union_set set::gist(const isl::union_set &context) const { - auto res = isl_set_unbind_params_insert_domain(copy(), domain.release()); - return manage(res); + return isl::union_set(*this).gist(context); } 
-isl::set set::unite(isl::set set2) const +isl::set set::gist(const isl::basic_set &context) const { - auto res = isl_set_union(copy(), set2.release()); - return manage(res); + return this->gist(isl::set(context)); } -isl::set set::universe(isl::space space) +isl::set set::gist(const isl::point &context) const { - auto res = isl_set_universe(space.release()); - return manage(res); + return this->gist(isl::set(context)); } -isl::basic_set set::unshifted_simple_hull() const +isl::set set::gist_params(isl::set context) const { - auto res = isl_set_unshifted_simple_hull(copy()); + auto res = isl_set_gist_params(copy(), context.release()); return manage(res); } -isl::basic_set set::unshifted_simple_hull_from_set_list(isl::set_list list) const +boolean set::has_equal_space(const isl::set &set2) const { - auto res = isl_set_unshifted_simple_hull_from_set_list(copy(), list.release()); + auto res = isl_set_has_equal_space(get(), set2.get()); return manage(res); } -isl::map set::unwrap() const +isl::map set::identity() const { - auto res = isl_set_unwrap(copy()); + auto res = isl_set_identity(copy()); return manage(res); } -isl::set set::upper_bound(isl::multi_pw_aff upper) const +isl::union_pw_multi_aff set::identity_union_pw_multi_aff() const { - auto res = isl_set_upper_bound_multi_pw_aff(copy(), upper.release()); - return manage(res); + return isl::union_set(*this).identity_union_pw_multi_aff(); } -isl::set set::upper_bound(isl::multi_val upper) const +isl::pw_aff set::indicator_function() const { - auto res = isl_set_upper_bound_multi_val(copy(), upper.release()); + auto res = isl_set_indicator_function(copy()); return manage(res); } -isl::set set::upper_bound_si(isl::dim type, unsigned int pos, int value) const +isl::set set::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const { - auto res = isl_set_upper_bound_si(copy(), static_cast(type), pos, value); + auto res = isl_set_insert_dims(copy(), static_cast(type), pos, n); return manage(res); } -isl::set 
set::upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const +isl::map set::insert_domain(isl::space domain) const { - auto res = isl_set_upper_bound_val(copy(), static_cast(type), pos, value.release()); + auto res = isl_set_insert_domain(copy(), domain.release()); return manage(res); } -isl::map set::wrapped_domain_map() const +isl::set set::intersect(isl::set set2) const { - auto res = isl_set_wrapped_domain_map(copy()); + auto res = isl_set_intersect(copy(), set2.release()); return manage(res); } -// implementations for isl::set_list -set_list manage(__isl_take isl_set_list *ptr) { - return set_list(ptr); -} -set_list manage_copy(__isl_keep isl_set_list *ptr) { - ptr = isl_set_list_copy(ptr); - return set_list(ptr); -} - -set_list::set_list() - : ptr(nullptr) {} - -set_list::set_list(const set_list &obj) - : ptr(nullptr) +isl::union_set set::intersect(const isl::union_set &uset2) const { - ptr = obj.copy(); -} - - -set_list::set_list(__isl_take isl_set_list *ptr) - : ptr(ptr) {} - - -set_list &set_list::operator=(set_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; -} - -set_list::~set_list() { - if (ptr) - isl_set_list_free(ptr); -} - -__isl_give isl_set_list *set_list::copy() const & { - return isl_set_list_copy(ptr); -} - -__isl_keep isl_set_list *set_list::get() const { - return ptr; -} - -__isl_give isl_set_list *set_list::release() { - isl_set_list *tmp = ptr; - ptr = nullptr; - return tmp; -} - -bool set_list::is_null() const { - return ptr == nullptr; -} - - -isl::ctx set_list::ctx() const { - return isl::ctx(isl_set_list_get_ctx(ptr)); -} - -void set_list::dump() const { - isl_set_list_dump(get()); + return isl::union_set(*this).intersect(uset2); } - -isl::set_list set_list::add(isl::set el) const +isl::set set::intersect(const isl::basic_set &set2) const { - auto res = isl_set_list_add(copy(), el.release()); - return manage(res); + return this->intersect(isl::set(set2)); } -isl::set_list set_list::alloc(isl::ctx ctx, int n) 
+isl::set set::intersect(const isl::point &set2) const { - auto res = isl_set_list_alloc(ctx.release(), n); - return manage(res); + return this->intersect(isl::set(set2)); } -isl::set_list set_list::clear() const +isl::set set::intersect_params(isl::set params) const { - auto res = isl_set_list_clear(copy()); + auto res = isl_set_intersect_params(copy(), params.release()); return manage(res); } -isl::set_list set_list::concat(isl::set_list list2) const +boolean set::involves_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_set_list_concat(copy(), list2.release()); + auto res = isl_set_involves_dims(get(), static_cast(type), first, n); return manage(res); } -isl::set_list set_list::drop(unsigned int first, unsigned int n) const +boolean set::involves_locals() const { - auto res = isl_set_list_drop(copy(), first, n); + auto res = isl_set_involves_locals(get()); return manage(res); } -stat set_list::foreach(const std::function &fn) const +boolean set::is_bounded() const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_set *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_set_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_set_is_bounded(get()); return manage(res); } -isl::set_list set_list::from_set(isl::set el) +boolean set::is_disjoint(const isl::set &set2) const { - auto res = isl_set_list_from_set(el.release()); + auto res = isl_set_is_disjoint(get(), set2.get()); return manage(res); } -isl::set set_list::get_at(int index) const +boolean set::is_disjoint(const isl::union_set &uset2) const { - auto res = isl_set_list_get_at(get(), index); - return manage(res); + return isl::union_set(*this).is_disjoint(uset2); } -isl::set set_list::get_set(int index) const +boolean set::is_disjoint(const isl::basic_set &set2) const { - auto res = isl_set_list_get_set(get(), index); - 
return manage(res); + return this->is_disjoint(isl::set(set2)); } -isl::set_list set_list::insert(unsigned int pos, isl::set el) const +boolean set::is_disjoint(const isl::point &set2) const { - auto res = isl_set_list_insert(copy(), pos, el.release()); - return manage(res); + return this->is_disjoint(isl::set(set2)); } -isl_size set_list::n_set() const +boolean set::is_empty() const { - auto res = isl_set_list_n_set(get()); - return res; + auto res = isl_set_is_empty(get()); + return manage(res); } -isl::set_list set_list::reverse() const +boolean set::is_equal(const isl::set &set2) const { - auto res = isl_set_list_reverse(copy()); + auto res = isl_set_is_equal(get(), set2.get()); return manage(res); } -isl::set_list set_list::set_set(int index, isl::set el) const +boolean set::is_equal(const isl::union_set &uset2) const { - auto res = isl_set_list_set_set(copy(), index, el.release()); - return manage(res); + return isl::union_set(*this).is_equal(uset2); } -isl_size set_list::size() const +boolean set::is_equal(const isl::basic_set &set2) const { - auto res = isl_set_list_size(get()); - return res; + return this->is_equal(isl::set(set2)); } -isl::set_list set_list::swap(unsigned int pos1, unsigned int pos2) const +boolean set::is_equal(const isl::point &set2) const { - auto res = isl_set_list_swap(copy(), pos1, pos2); - return manage(res); + return this->is_equal(isl::set(set2)); } -isl::set set_list::unite() const +boolean set::is_params() const { - auto res = isl_set_list_union(copy()); + auto res = isl_set_is_params(get()); return manage(res); } -// implementations for isl::space -space manage(__isl_take isl_space *ptr) { - return space(ptr); -} -space manage_copy(__isl_keep isl_space *ptr) { - ptr = isl_space_copy(ptr); - return space(ptr); -} - -space::space() - : ptr(nullptr) {} - -space::space(const space &obj) - : ptr(nullptr) +boolean set::is_singleton() const { - ptr = obj.copy(); + auto res = isl_set_is_singleton(get()); + return manage(res); } - 
-space::space(__isl_take isl_space *ptr) - : ptr(ptr) {} - -space::space(isl::ctx ctx, unsigned int nparam, unsigned int n_in, unsigned int n_out) +boolean set::is_strict_subset(const isl::set &set2) const { - auto res = isl_space_alloc(ctx.release(), nparam, n_in, n_out); - ptr = res; + auto res = isl_set_is_strict_subset(get(), set2.get()); + return manage(res); } -space::space(isl::ctx ctx, unsigned int nparam, unsigned int dim) + +boolean set::is_strict_subset(const isl::union_set &uset2) const { - auto res = isl_space_set_alloc(ctx.release(), nparam, dim); - ptr = res; + return isl::union_set(*this).is_strict_subset(uset2); } -space &space::operator=(space obj) { - std::swap(this->ptr, obj.ptr); - return *this; +boolean set::is_strict_subset(const isl::basic_set &set2) const +{ + return this->is_strict_subset(isl::set(set2)); } -space::~space() { - if (ptr) - isl_space_free(ptr); +boolean set::is_strict_subset(const isl::point &set2) const +{ + return this->is_strict_subset(isl::set(set2)); } -__isl_give isl_space *space::copy() const & { - return isl_space_copy(ptr); +boolean set::is_subset(const isl::set &set2) const +{ + auto res = isl_set_is_subset(get(), set2.get()); + return manage(res); } -__isl_keep isl_space *space::get() const { - return ptr; +boolean set::is_subset(const isl::union_set &uset2) const +{ + return isl::union_set(*this).is_subset(uset2); } -__isl_give isl_space *space::release() { - isl_space *tmp = ptr; - ptr = nullptr; - return tmp; +boolean set::is_subset(const isl::basic_set &set2) const +{ + return this->is_subset(isl::set(set2)); } -bool space::is_null() const { - return ptr == nullptr; +boolean set::is_subset(const isl::point &set2) const +{ + return this->is_subset(isl::set(set2)); } - -isl::ctx space::ctx() const { - return isl::ctx(isl_space_get_ctx(ptr)); +boolean set::is_wrapping() const +{ + auto res = isl_set_is_wrapping(get()); + return manage(res); } -void space::dump() const { - isl_space_dump(get()); +boolean 
set::isa_set() const +{ + return isl::union_set(*this).isa_set(); } - -isl::space space::add_dims(isl::dim type, unsigned int n) const +isl::set set::lexmax() const { - auto res = isl_space_add_dims(copy(), static_cast(type), n); + auto res = isl_set_lexmax(copy()); return manage(res); } -isl::space space::add_named_tuple(isl::id tuple_id, unsigned int dim) const +isl::pw_multi_aff set::lexmax_pw_multi_aff() const { - auto res = isl_space_add_named_tuple_id_ui(copy(), tuple_id.release(), dim); + auto res = isl_set_lexmax_pw_multi_aff(copy()); return manage(res); } -isl::space space::add_param_id(isl::id id) const +isl::set set::lexmin() const { - auto res = isl_space_add_param_id(copy(), id.release()); + auto res = isl_set_lexmin(copy()); return manage(res); } -isl::space space::add_unnamed_tuple(unsigned int dim) const +isl::pw_multi_aff set::lexmin_pw_multi_aff() const { - auto res = isl_space_add_unnamed_tuple_ui(copy(), dim); + auto res = isl_set_lexmin_pw_multi_aff(copy()); return manage(res); } -isl::space space::align_params(isl::space space2) const +isl::set set::lower_bound(isl::multi_pw_aff lower) const { - auto res = isl_space_align_params(copy(), space2.release()); + auto res = isl_set_lower_bound_multi_pw_aff(copy(), lower.release()); return manage(res); } -boolean space::can_curry() const +isl::set set::lower_bound(isl::multi_val lower) const { - auto res = isl_space_can_curry(get()); + auto res = isl_set_lower_bound_multi_val(copy(), lower.release()); return manage(res); } -boolean space::can_range_curry() const +isl::set set::lower_bound_si(isl::dim type, unsigned int pos, int value) const { - auto res = isl_space_can_range_curry(get()); + auto res = isl_set_lower_bound_si(copy(), static_cast(type), pos, value); return manage(res); } -boolean space::can_uncurry() const +isl::set set::lower_bound_val(isl::dim type, unsigned int pos, isl::val value) const { - auto res = isl_space_can_uncurry(get()); + auto res = isl_set_lower_bound_val(copy(), 
static_cast(type), pos, value.release()); return manage(res); } -boolean space::can_zip() const +isl::set set::lower_bound_val(isl::dim type, unsigned int pos, long value) const { - auto res = isl_space_can_zip(get()); - return manage(res); + return this->lower_bound_val(type, pos, isl::val(ctx(), value)); } -isl::space space::curry() const +isl::multi_pw_aff set::max_multi_pw_aff() const { - auto res = isl_space_curry(copy()); + auto res = isl_set_max_multi_pw_aff(copy()); return manage(res); } -isl_size space::dim(isl::dim type) const +isl::val set::max_val(const isl::aff &obj) const { - auto res = isl_space_dim(get(), static_cast(type)); - return res; + auto res = isl_set_max_val(get(), obj.get()); + return manage(res); } -isl::space space::domain() const +isl::multi_pw_aff set::min_multi_pw_aff() const { - auto res = isl_space_domain(copy()); + auto res = isl_set_min_multi_pw_aff(copy()); return manage(res); } -isl::space space::domain_factor_domain() const +isl::val set::min_val(const isl::aff &obj) const { - auto res = isl_space_domain_factor_domain(copy()); + auto res = isl_set_min_val(get(), obj.get()); return manage(res); } -isl::space space::domain_factor_range() const +class size set::n_basic_set() const { - auto res = isl_space_domain_factor_range(copy()); + auto res = isl_set_n_basic_set(get()); return manage(res); } -boolean space::domain_is_wrapping() const +isl::set set::params() const { - auto res = isl_space_domain_is_wrapping(get()); + auto res = isl_set_params(copy()); return manage(res); } -isl::space space::domain_map() const +isl::val set::plain_get_val_if_fixed(isl::dim type, unsigned int pos) const { - auto res = isl_space_domain_map(copy()); + auto res = isl_set_plain_get_val_if_fixed(get(), static_cast(type), pos); return manage(res); } -isl::space space::domain_product(isl::space right) const +isl::multi_val set::plain_multi_val_if_fixed() const { - auto res = isl_space_domain_product(copy(), right.release()); + auto res = 
isl_set_get_plain_multi_val_if_fixed(get()); return manage(res); } -isl::space space::drop_all_params() const +isl::multi_val set::get_plain_multi_val_if_fixed() const { - auto res = isl_space_drop_all_params(copy()); - return manage(res); + return plain_multi_val_if_fixed(); } -isl::space space::drop_dims(isl::dim type, unsigned int first, unsigned int num) const +isl::basic_set set::polyhedral_hull() const { - auto res = isl_space_drop_dims(copy(), static_cast(type), first, num); + auto res = isl_set_polyhedral_hull(copy()); return manage(res); } -isl::space space::factor_domain() const +isl::set set::preimage(isl::multi_aff ma) const { - auto res = isl_space_factor_domain(copy()); + auto res = isl_set_preimage_multi_aff(copy(), ma.release()); return manage(res); } -isl::space space::factor_range() const +isl::set set::preimage(isl::multi_pw_aff mpa) const { - auto res = isl_space_factor_range(copy()); + auto res = isl_set_preimage_multi_pw_aff(copy(), mpa.release()); return manage(res); } -int space::find_dim_by_id(isl::dim type, const isl::id &id) const +isl::set set::preimage(isl::pw_multi_aff pma) const { - auto res = isl_space_find_dim_by_id(get(), static_cast(type), id.get()); - return res; + auto res = isl_set_preimage_pw_multi_aff(copy(), pma.release()); + return manage(res); } -int space::find_dim_by_name(isl::dim type, const std::string &name) const +isl::union_set set::preimage(const isl::union_pw_multi_aff &upma) const { - auto res = isl_space_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return isl::union_set(*this).preimage(upma); } -isl::space space::flatten_domain() const +isl::set set::product(isl::set set2) const { - auto res = isl_space_flatten_domain(copy()); + auto res = isl_set_product(copy(), set2.release()); return manage(res); } -isl::space space::flatten_range() const +isl::set set::project_out(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_space_flatten_range(copy()); + auto res = 
isl_set_project_out(copy(), static_cast(type), first, n); return manage(res); } -isl::space space::from_domain() const +isl::set set::project_out_all_params() const { - auto res = isl_space_from_domain(copy()); + auto res = isl_set_project_out_all_params(copy()); return manage(res); } -isl::space space::from_range() const +isl::set set::project_out_param(isl::id id) const { - auto res = isl_space_from_range(copy()); + auto res = isl_set_project_out_param_id(copy(), id.release()); return manage(res); } -isl::id space::get_dim_id(isl::dim type, unsigned int pos) const +isl::set set::project_out_param(const std::string &id) const { - auto res = isl_space_get_dim_id(get(), static_cast(type), pos); - return manage(res); + return this->project_out_param(isl::id(ctx(), id)); } -std::string space::get_dim_name(isl::dim type, unsigned int pos) const +isl::set set::project_out_param(isl::id_list list) const { - auto res = isl_space_get_dim_name(get(), static_cast(type), pos); - std::string tmp(res); - return tmp; + auto res = isl_set_project_out_param_id_list(copy(), list.release()); + return manage(res); } -isl::id space::get_tuple_id(isl::dim type) const +isl::pw_multi_aff set::pw_multi_aff_on_domain(isl::multi_val mv) const { - auto res = isl_space_get_tuple_id(get(), static_cast(type)); + auto res = isl_set_pw_multi_aff_on_domain_multi_val(copy(), mv.release()); return manage(res); } -std::string space::get_tuple_name(isl::dim type) const +isl::set set::remove_dims(isl::dim type, unsigned int first, unsigned int n) const { - auto res = isl_space_get_tuple_name(get(), static_cast(type)); - std::string tmp(res); - return tmp; + auto res = isl_set_remove_dims(copy(), static_cast(type), first, n); + return manage(res); } -boolean space::has_dim_id(isl::dim type, unsigned int pos) const +isl::set set::remove_divs() const { - auto res = isl_space_has_dim_id(get(), static_cast(type), pos); + auto res = isl_set_remove_divs(copy()); return manage(res); } -boolean 
space::has_dim_name(isl::dim type, unsigned int pos) const +isl::set set::remove_redundancies() const { - auto res = isl_space_has_dim_name(get(), static_cast(type), pos); + auto res = isl_set_remove_redundancies(copy()); return manage(res); } -boolean space::has_equal_params(const isl::space &space2) const +isl::set set::reset_tuple_id() const { - auto res = isl_space_has_equal_params(get(), space2.get()); + auto res = isl_set_reset_tuple_id(copy()); return manage(res); } -boolean space::has_equal_tuples(const isl::space &space2) const +isl::basic_set set::sample() const { - auto res = isl_space_has_equal_tuples(get(), space2.get()); + auto res = isl_set_sample(copy()); return manage(res); } -boolean space::has_tuple_id(isl::dim type) const +isl::point set::sample_point() const { - auto res = isl_space_has_tuple_id(get(), static_cast(type)); + auto res = isl_set_sample_point(copy()); return manage(res); } -boolean space::has_tuple_name(isl::dim type) const +isl::set set::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const { - auto res = isl_space_has_tuple_name(get(), static_cast(type)); + auto res = isl_set_set_dim_id(copy(), static_cast(type), pos, id.release()); return manage(res); } -isl::space space::insert_dims(isl::dim type, unsigned int pos, unsigned int n) const +isl::set set::set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const { - auto res = isl_space_insert_dims(copy(), static_cast(type), pos, n); - return manage(res); + return this->set_dim_id(type, pos, isl::id(ctx(), id)); } -boolean space::is_domain(const isl::space &space2) const +isl::set_list set::set_list() const { - auto res = isl_space_is_domain(get(), space2.get()); - return manage(res); + return isl::union_set(*this).set_list(); } -boolean space::is_equal(const isl::space &space2) const +isl::set set::set_tuple_id(isl::id id) const { - auto res = isl_space_is_equal(get(), space2.get()); + auto res = isl_set_set_tuple_id(copy(), id.release()); return 
manage(res); } -boolean space::is_map() const +isl::set set::set_tuple_id(const std::string &id) const { - auto res = isl_space_is_map(get()); - return manage(res); + return this->set_tuple_id(isl::id(ctx(), id)); } -boolean space::is_params() const +isl::fixed_box set::simple_fixed_box_hull() const { - auto res = isl_space_is_params(get()); + auto res = isl_set_get_simple_fixed_box_hull(get()); return manage(res); } -boolean space::is_product() const +isl::fixed_box set::get_simple_fixed_box_hull() const { - auto res = isl_space_is_product(get()); - return manage(res); + return simple_fixed_box_hull(); } -boolean space::is_range(const isl::space &space2) const +isl::basic_set set::simple_hull() const { - auto res = isl_space_is_range(get(), space2.get()); + auto res = isl_set_simple_hull(copy()); return manage(res); } -boolean space::is_set() const +isl::space set::space() const { - auto res = isl_space_is_set(get()); + auto res = isl_set_get_space(get()); return manage(res); } -boolean space::is_wrapping() const +isl::space set::get_space() const { - auto res = isl_space_is_wrapping(get()); - return manage(res); + return space(); } -isl::space space::join(isl::space right) const +isl::val set::stride(int pos) const { - auto res = isl_space_join(copy(), right.release()); + auto res = isl_set_get_stride(get(), pos); return manage(res); } -isl::space space::map_from_domain_and_range(isl::space range) const +isl::val set::get_stride(int pos) const { - auto res = isl_space_map_from_domain_and_range(copy(), range.release()); - return manage(res); + return stride(pos); } -isl::space space::map_from_set() const +isl::set set::subtract(isl::set set2) const { - auto res = isl_space_map_from_set(copy()); + auto res = isl_set_subtract(copy(), set2.release()); return manage(res); } -isl::space space::move_dims(isl::dim dst_type, unsigned int dst_pos, isl::dim src_type, unsigned int src_pos, unsigned int n) const +isl::union_set set::subtract(const isl::union_set &uset2) const 
{ - auto res = isl_space_move_dims(copy(), static_cast(dst_type), dst_pos, static_cast(src_type), src_pos, n); - return manage(res); + return isl::union_set(*this).subtract(uset2); } -isl::space space::params() const +isl::set set::subtract(const isl::basic_set &set2) const { - auto res = isl_space_params(copy()); - return manage(res); + return this->subtract(isl::set(set2)); } -isl::space space::params_alloc(isl::ctx ctx, unsigned int nparam) +isl::set set::subtract(const isl::point &set2) const { - auto res = isl_space_params_alloc(ctx.release(), nparam); - return manage(res); + return this->subtract(isl::set(set2)); } -isl::space space::product(isl::space right) const +isl::set_list set::to_list() const { - auto res = isl_space_product(copy(), right.release()); + auto res = isl_set_to_list(copy()); return manage(res); } -isl::space space::range() const +isl::union_set set::to_union_set() const { - auto res = isl_space_range(copy()); + auto res = isl_set_to_union_set(copy()); return manage(res); } -isl::space space::range_curry() const +isl::map set::translation() const { - auto res = isl_space_range_curry(copy()); + auto res = isl_set_translation(copy()); return manage(res); } -isl::space space::range_factor_domain() const +class size set::tuple_dim() const { - auto res = isl_space_range_factor_domain(copy()); + auto res = isl_set_tuple_dim(get()); return manage(res); } -isl::space space::range_factor_range() const +isl::id set::tuple_id() const { - auto res = isl_space_range_factor_range(copy()); + auto res = isl_set_get_tuple_id(get()); return manage(res); } -boolean space::range_is_wrapping() const +isl::id set::get_tuple_id() const { - auto res = isl_space_range_is_wrapping(get()); - return manage(res); + return tuple_id(); } -isl::space space::range_map() const +std::string set::tuple_name() const { - auto res = isl_space_range_map(copy()); - return manage(res); + auto res = isl_set_get_tuple_name(get()); + std::string tmp(res); + return tmp; } -isl::space 
space::range_product(isl::space right) const +std::string set::get_tuple_name() const { - auto res = isl_space_range_product(copy(), right.release()); - return manage(res); + return tuple_name(); } -isl::space space::range_reverse() const +isl::set set::unbind_params(isl::multi_id tuple) const { - auto res = isl_space_range_reverse(copy()); + auto res = isl_set_unbind_params(copy(), tuple.release()); return manage(res); } -isl::space space::reset_tuple_id(isl::dim type) const +isl::map set::unbind_params_insert_domain(isl::multi_id domain) const { - auto res = isl_space_reset_tuple_id(copy(), static_cast(type)); + auto res = isl_set_unbind_params_insert_domain(copy(), domain.release()); return manage(res); } -isl::space space::reset_user() const +isl::set set::unite(isl::set set2) const { - auto res = isl_space_reset_user(copy()); + auto res = isl_set_union(copy(), set2.release()); return manage(res); } -isl::space space::reverse() const +isl::union_set set::unite(const isl::union_set &uset2) const { - auto res = isl_space_reverse(copy()); - return manage(res); + return isl::union_set(*this).unite(uset2); } -isl::space space::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const +isl::set set::unite(const isl::basic_set &set2) const { - auto res = isl_space_set_dim_id(copy(), static_cast(type), pos, id.release()); - return manage(res); + return this->unite(isl::set(set2)); } -isl::space space::set_from_params() const +isl::set set::unite(const isl::point &set2) const { - auto res = isl_space_set_from_params(copy()); - return manage(res); + return this->unite(isl::set(set2)); } -isl::space space::set_tuple_id(isl::dim type, isl::id id) const +isl::set set::universe(isl::space space) { - auto res = isl_space_set_tuple_id(copy(), static_cast(type), id.release()); + auto res = isl_set_universe(space.release()); return manage(res); } -isl::space space::set_tuple_name(isl::dim type, const std::string &s) const +isl::basic_set set::unshifted_simple_hull() const { 
- auto res = isl_space_set_tuple_name(copy(), static_cast(type), s.c_str()); + auto res = isl_set_unshifted_simple_hull(copy()); return manage(res); } -boolean space::tuple_is_equal(isl::dim type1, const isl::space &space2, isl::dim type2) const +isl::map set::unwrap() const { - auto res = isl_space_tuple_is_equal(get(), static_cast(type1), space2.get(), static_cast(type2)); + auto res = isl_set_unwrap(copy()); return manage(res); } -isl::space space::uncurry() const +isl::set set::upper_bound(isl::multi_pw_aff upper) const { - auto res = isl_space_uncurry(copy()); + auto res = isl_set_upper_bound_multi_pw_aff(copy(), upper.release()); return manage(res); } -isl::space space::unit(isl::ctx ctx) +isl::set set::upper_bound(isl::multi_val upper) const { - auto res = isl_space_unit(ctx.release()); + auto res = isl_set_upper_bound_multi_val(copy(), upper.release()); return manage(res); } -isl::space space::unwrap() const +isl::set set::upper_bound_val(isl::dim type, unsigned int pos, isl::val value) const { - auto res = isl_space_unwrap(copy()); + auto res = isl_set_upper_bound_val(copy(), static_cast(type), pos, value.release()); return manage(res); } -isl::space space::wrap() const +isl::set set::upper_bound_val(isl::dim type, unsigned int pos, long value) const { - auto res = isl_space_wrap(copy()); - return manage(res); + return this->upper_bound_val(type, pos, isl::val(ctx(), value)); } -isl::space space::zip() const +inline std::ostream &operator<<(std::ostream &os, const set &obj) { - auto res = isl_space_zip(copy()); - return manage(res); + char *str = isl_set_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::term -term manage(__isl_take isl_term *ptr) { - return term(ptr); +// implementations for isl::set_list +set_list manage(__isl_take isl_set_list *ptr) { + return set_list(ptr); } -term manage_copy(__isl_keep isl_term *ptr) { - ptr = 
isl_term_copy(ptr); - return term(ptr); +set_list manage_copy(__isl_keep isl_set_list *ptr) { + ptr = isl_set_list_copy(ptr); + return set_list(ptr); } -term::term() +set_list::set_list() : ptr(nullptr) {} -term::term(const term &obj) +set_list::set_list(const set_list &obj) : ptr(nullptr) { ptr = obj.copy(); } - -term::term(__isl_take isl_term *ptr) +set_list::set_list(__isl_take isl_set_list *ptr) : ptr(ptr) {} +set_list::set_list(isl::ctx ctx, int n) +{ + auto res = isl_set_list_alloc(ctx.release(), n); + ptr = res; +} + +set_list::set_list(isl::set el) +{ + auto res = isl_set_list_from_set(el.release()); + ptr = res; +} + +set_list::set_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_set_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} -term &term::operator=(term obj) { +set_list &set_list::operator=(set_list obj) { std::swap(this->ptr, obj.ptr); return *this; } -term::~term() { +set_list::~set_list() { if (ptr) - isl_term_free(ptr); + isl_set_list_free(ptr); } -__isl_give isl_term *term::copy() const & { - return isl_term_copy(ptr); +__isl_give isl_set_list *set_list::copy() const & { + return isl_set_list_copy(ptr); } -__isl_keep isl_term *term::get() const { +__isl_keep isl_set_list *set_list::get() const { return ptr; } -__isl_give isl_term *term::release() { - isl_term *tmp = ptr; +__isl_give isl_set_list *set_list::release() { + isl_set_list *tmp = ptr; ptr = nullptr; return tmp; } -bool term::is_null() const { +bool set_list::is_null() const { return ptr == nullptr; } +isl::ctx set_list::ctx() const { + return isl::ctx(isl_set_list_get_ctx(ptr)); +} -isl::ctx term::ctx() const { - return isl::ctx(isl_term_get_ctx(ptr)); +isl::set_list set_list::add(isl::set el) const +{ + auto res = isl_set_list_add(copy(), el.release()); + return manage(res); } +isl::set set_list::at(int index) const +{ + auto res = isl_set_list_get_at(get(), index); + return manage(res); +} -isl_size term::dim(isl::dim type) const +isl::set 
set_list::get_at(int index) const { - auto res = isl_term_dim(get(), static_cast(type)); - return res; + return at(index); } -isl::val term::get_coefficient_val() const +isl::set_list set_list::clear() const { - auto res = isl_term_get_coefficient_val(get()); + auto res = isl_set_list_clear(copy()); return manage(res); } -isl::aff term::get_div(unsigned int pos) const +isl::set_list set_list::concat(isl::set_list list2) const { - auto res = isl_term_get_div(get(), pos); + auto res = isl_set_list_concat(copy(), list2.release()); return manage(res); } -isl_size term::get_exp(isl::dim type, unsigned int pos) const +isl::set_list set_list::drop(unsigned int first, unsigned int n) const { - auto res = isl_term_get_exp(get(), static_cast(type), pos); - return res; + auto res = isl_set_list_drop(copy(), first, n); + return manage(res); } -// implementations for isl::union_access_info -union_access_info manage(__isl_take isl_union_access_info *ptr) { - return union_access_info(ptr); +stat set_list::foreach(const std::function &fn) const +{ + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_set *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_set_list_foreach(get(), fn_lambda, &fn_data); + return manage(res); } -union_access_info manage_copy(__isl_keep isl_union_access_info *ptr) { - ptr = isl_union_access_info_copy(ptr); - return union_access_info(ptr); + +isl::set_list set_list::insert(unsigned int pos, isl::set el) const +{ + auto res = isl_set_list_insert(copy(), pos, el.release()); + return manage(res); } -union_access_info::union_access_info() +class size set_list::size() const +{ + auto res = isl_set_list_size(get()); + return manage(res); +} + +inline std::ostream &operator<<(std::ostream &os, const set_list &obj) +{ + char *str = isl_set_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return 
os; + } + os << str; + free(str); + return os; +} + +// implementations for isl::space +space manage(__isl_take isl_space *ptr) { + return space(ptr); +} +space manage_copy(__isl_keep isl_space *ptr) { + ptr = isl_space_copy(ptr); + return space(ptr); +} + +space::space() : ptr(nullptr) {} -union_access_info::union_access_info(const union_access_info &obj) +space::space(const space &obj) : ptr(nullptr) { ptr = obj.copy(); } - -union_access_info::union_access_info(__isl_take isl_union_access_info *ptr) +space::space(__isl_take isl_space *ptr) : ptr(ptr) {} -union_access_info::union_access_info(isl::union_map sink) +space::space(isl::ctx ctx, unsigned int nparam, unsigned int n_in, unsigned int n_out) { - auto res = isl_union_access_info_from_sink(sink.release()); + auto res = isl_space_alloc(ctx.release(), nparam, n_in, n_out); ptr = res; } -union_access_info &union_access_info::operator=(union_access_info obj) { +space::space(isl::ctx ctx, unsigned int nparam, unsigned int dim) +{ + auto res = isl_space_set_alloc(ctx.release(), nparam, dim); + ptr = res; +} + +space &space::operator=(space obj) { std::swap(this->ptr, obj.ptr); return *this; } -union_access_info::~union_access_info() { +space::~space() { if (ptr) - isl_union_access_info_free(ptr); + isl_space_free(ptr); } -__isl_give isl_union_access_info *union_access_info::copy() const & { - return isl_union_access_info_copy(ptr); +__isl_give isl_space *space::copy() const & { + return isl_space_copy(ptr); } -__isl_keep isl_union_access_info *union_access_info::get() const { +__isl_keep isl_space *space::get() const { return ptr; } -__isl_give isl_union_access_info *union_access_info::release() { - isl_union_access_info *tmp = ptr; +__isl_give isl_space *space::release() { + isl_space *tmp = ptr; ptr = nullptr; return tmp; } -bool union_access_info::is_null() const { +bool space::is_null() const { return ptr == nullptr; } - -isl::ctx union_access_info::ctx() const { - return 
isl::ctx(isl_union_access_info_get_ctx(ptr)); +isl::ctx space::ctx() const { + return isl::ctx(isl_space_get_ctx(ptr)); } - -isl::union_flow union_access_info::compute_flow() const +isl::space space::add_dims(isl::dim type, unsigned int n) const { - auto res = isl_union_access_info_compute_flow(copy()); + auto res = isl_space_add_dims(copy(), static_cast(type), n); return manage(res); } -isl::union_access_info union_access_info::set_kill(isl::union_map kill) const +isl::space space::add_named_tuple(isl::id tuple_id, unsigned int dim) const { - auto res = isl_union_access_info_set_kill(copy(), kill.release()); + auto res = isl_space_add_named_tuple_id_ui(copy(), tuple_id.release(), dim); return manage(res); } -isl::union_access_info union_access_info::set_may_source(isl::union_map may_source) const +isl::space space::add_named_tuple(const std::string &tuple_id, unsigned int dim) const { - auto res = isl_union_access_info_set_may_source(copy(), may_source.release()); - return manage(res); + return this->add_named_tuple(isl::id(ctx(), tuple_id), dim); } -isl::union_access_info union_access_info::set_must_source(isl::union_map must_source) const +isl::space space::add_param(isl::id id) const { - auto res = isl_union_access_info_set_must_source(copy(), must_source.release()); + auto res = isl_space_add_param_id(copy(), id.release()); return manage(res); } -isl::union_access_info union_access_info::set_schedule(isl::schedule schedule) const +isl::space space::add_param(const std::string &id) const { - auto res = isl_union_access_info_set_schedule(copy(), schedule.release()); - return manage(res); + return this->add_param(isl::id(ctx(), id)); } -isl::union_access_info union_access_info::set_schedule_map(isl::union_map schedule_map) const +isl::space space::add_unnamed_tuple(unsigned int dim) const { - auto res = isl_union_access_info_set_schedule_map(copy(), schedule_map.release()); + auto res = isl_space_add_unnamed_tuple_ui(copy(), dim); return manage(res); } -// 
implementations for isl::union_flow -union_flow manage(__isl_take isl_union_flow *ptr) { - return union_flow(ptr); -} -union_flow manage_copy(__isl_keep isl_union_flow *ptr) { - ptr = isl_union_flow_copy(ptr); - return union_flow(ptr); -} - -union_flow::union_flow() - : ptr(nullptr) {} - -union_flow::union_flow(const union_flow &obj) - : ptr(nullptr) +isl::space space::align_params(isl::space space2) const { - ptr = obj.copy(); -} - - -union_flow::union_flow(__isl_take isl_union_flow *ptr) - : ptr(ptr) {} - - -union_flow &union_flow::operator=(union_flow obj) { - std::swap(this->ptr, obj.ptr); - return *this; -} - -union_flow::~union_flow() { - if (ptr) - isl_union_flow_free(ptr); -} - -__isl_give isl_union_flow *union_flow::copy() const & { - return isl_union_flow_copy(ptr); -} - -__isl_keep isl_union_flow *union_flow::get() const { - return ptr; -} - -__isl_give isl_union_flow *union_flow::release() { - isl_union_flow *tmp = ptr; - ptr = nullptr; - return tmp; -} - -bool union_flow::is_null() const { - return ptr == nullptr; -} - - -isl::ctx union_flow::ctx() const { - return isl::ctx(isl_union_flow_get_ctx(ptr)); + auto res = isl_space_align_params(copy(), space2.release()); + return manage(res); } - -isl::union_map union_flow::get_full_may_dependence() const +isl::space space::curry() const { - auto res = isl_union_flow_get_full_may_dependence(get()); + auto res = isl_space_curry(copy()); return manage(res); } -isl::union_map union_flow::get_full_must_dependence() const +class size space::dim(isl::dim type) const { - auto res = isl_union_flow_get_full_must_dependence(get()); + auto res = isl_space_dim(get(), static_cast(type)); return manage(res); } -isl::union_map union_flow::get_may_dependence() const +isl::id space::dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_union_flow_get_may_dependence(get()); + auto res = isl_space_get_dim_id(get(), static_cast(type), pos); return manage(res); } -isl::union_map union_flow::get_may_no_source() const 
+isl::id space::get_dim_id(isl::dim type, unsigned int pos) const { - auto res = isl_union_flow_get_may_no_source(get()); - return manage(res); + return dim_id(type, pos); } -isl::union_map union_flow::get_must_dependence() const +isl::space space::domain() const { - auto res = isl_union_flow_get_must_dependence(get()); + auto res = isl_space_domain(copy()); return manage(res); } -isl::union_map union_flow::get_must_no_source() const +isl::multi_aff space::domain_map_multi_aff() const { - auto res = isl_union_flow_get_must_no_source(get()); + auto res = isl_space_domain_map_multi_aff(copy()); return manage(res); } -// implementations for isl::union_map -union_map manage(__isl_take isl_union_map *ptr) { - return union_map(ptr); -} -union_map manage_copy(__isl_keep isl_union_map *ptr) { - ptr = isl_union_map_copy(ptr); - return union_map(ptr); +isl::pw_multi_aff space::domain_map_pw_multi_aff() const +{ + auto res = isl_space_domain_map_pw_multi_aff(copy()); + return manage(res); } -union_map::union_map() - : ptr(nullptr) {} - -union_map::union_map(const union_map &obj) - : ptr(nullptr) +isl::id space::domain_tuple_id() const { - ptr = obj.copy(); + auto res = isl_space_get_domain_tuple_id(get()); + return manage(res); } +isl::id space::get_domain_tuple_id() const +{ + return domain_tuple_id(); +} -union_map::union_map(__isl_take isl_union_map *ptr) - : ptr(ptr) {} +isl::space space::drop_dims(isl::dim type, unsigned int first, unsigned int num) const +{ + auto res = isl_space_drop_dims(copy(), static_cast(type), first, num); + return manage(res); +} -union_map::union_map(isl::basic_map bmap) +int space::find_dim_by_id(isl::dim type, const isl::id &id) const { - auto res = isl_union_map_from_basic_map(bmap.release()); - ptr = res; + auto res = isl_space_find_dim_by_id(get(), static_cast(type), id.get()); + return res; } -union_map::union_map(isl::map map) + +int space::find_dim_by_id(isl::dim type, const std::string &id) const { - auto res = 
isl_union_map_from_map(map.release()); - ptr = res; + return this->find_dim_by_id(type, isl::id(ctx(), id)); } -union_map::union_map(isl::union_pw_multi_aff upma) + +isl::space space::flatten_domain() const { - auto res = isl_union_map_from_union_pw_multi_aff(upma.release()); - ptr = res; + auto res = isl_space_flatten_domain(copy()); + return manage(res); } -union_map::union_map(isl::ctx ctx, const std::string &str) + +isl::space space::flatten_range() const { - auto res = isl_union_map_read_from_str(ctx.release(), str.c_str()); - ptr = res; + auto res = isl_space_flatten_range(copy()); + return manage(res); } -union_map &union_map::operator=(union_map obj) { - std::swap(this->ptr, obj.ptr); - return *this; +boolean space::has_domain_tuple_id() const +{ + auto res = isl_space_has_domain_tuple_id(get()); + return manage(res); } -union_map::~union_map() { - if (ptr) - isl_union_map_free(ptr); +boolean space::has_equal_tuples(const isl::space &space2) const +{ + auto res = isl_space_has_equal_tuples(get(), space2.get()); + return manage(res); } -__isl_give isl_union_map *union_map::copy() const & { - return isl_union_map_copy(ptr); +boolean space::has_range_tuple_id() const +{ + auto res = isl_space_has_range_tuple_id(get()); + return manage(res); } -__isl_keep isl_union_map *union_map::get() const { - return ptr; +boolean space::has_tuple_id(isl::dim type) const +{ + auto res = isl_space_has_tuple_id(get(), static_cast(type)); + return manage(res); } -__isl_give isl_union_map *union_map::release() { - isl_union_map *tmp = ptr; - ptr = nullptr; - return tmp; +boolean space::has_tuple_name(isl::dim type) const +{ + auto res = isl_space_has_tuple_name(get(), static_cast(type)); + return manage(res); } -bool union_map::is_null() const { - return ptr == nullptr; +isl::multi_aff space::identity_multi_aff_on_domain() const +{ + auto res = isl_space_identity_multi_aff_on_domain(copy()); + return manage(res); } +isl::multi_pw_aff space::identity_multi_pw_aff_on_domain() 
const +{ + auto res = isl_space_identity_multi_pw_aff_on_domain(copy()); + return manage(res); +} -isl::ctx union_map::ctx() const { - return isl::ctx(isl_union_map_get_ctx(ptr)); +isl::pw_multi_aff space::identity_pw_multi_aff_on_domain() const +{ + auto res = isl_space_identity_pw_multi_aff_on_domain(copy()); + return manage(res); } -void union_map::dump() const { - isl_union_map_dump(get()); +boolean space::is_equal(const isl::space &space2) const +{ + auto res = isl_space_is_equal(get(), space2.get()); + return manage(res); } +boolean space::is_params() const +{ + auto res = isl_space_is_params(get()); + return manage(res); +} -isl::union_map union_map::affine_hull() const +boolean space::is_set() const { - auto res = isl_union_map_affine_hull(copy()); + auto res = isl_space_is_set(get()); return manage(res); } -isl::union_map union_map::align_params(isl::space model) const +boolean space::is_wrapping() const { - auto res = isl_union_map_align_params(copy(), model.release()); + auto res = isl_space_is_wrapping(get()); return manage(res); } -isl::union_map union_map::apply_domain(isl::union_map umap2) const +isl::space space::map_from_domain_and_range(isl::space range) const { - auto res = isl_union_map_apply_domain(copy(), umap2.release()); + auto res = isl_space_map_from_domain_and_range(copy(), range.release()); return manage(res); } -isl::union_map union_map::apply_range(isl::union_map umap2) const +isl::space space::map_from_set() const { - auto res = isl_union_map_apply_range(copy(), umap2.release()); + auto res = isl_space_map_from_set(copy()); return manage(res); } -isl::union_set union_map::bind_range(isl::multi_id tuple) const +isl::multi_aff space::multi_aff(isl::aff_list list) const { - auto res = isl_union_map_bind_range(copy(), tuple.release()); + auto res = isl_space_multi_aff(copy(), list.release()); return manage(res); } -isl::union_map union_map::coalesce() const +isl::multi_aff space::multi_aff_on_domain(isl::multi_val mv) const { - auto res = 
isl_union_map_coalesce(copy()); + auto res = isl_space_multi_aff_on_domain_multi_val(copy(), mv.release()); return manage(res); } -boolean union_map::contains(const isl::space &space) const +isl::multi_id space::multi_id(isl::id_list list) const { - auto res = isl_union_map_contains(get(), space.get()); + auto res = isl_space_multi_id(copy(), list.release()); return manage(res); } -isl::union_map union_map::curry() const +isl::multi_pw_aff space::multi_pw_aff(isl::pw_aff_list list) const { - auto res = isl_union_map_curry(copy()); + auto res = isl_space_multi_pw_aff(copy(), list.release()); return manage(res); } -isl::union_set union_map::deltas() const +isl::multi_union_pw_aff space::multi_union_pw_aff(isl::union_pw_aff_list list) const { - auto res = isl_union_map_deltas(copy()); + auto res = isl_space_multi_union_pw_aff(copy(), list.release()); return manage(res); } -isl::union_map union_map::deltas_map() const +isl::multi_val space::multi_val(isl::val_list list) const { - auto res = isl_union_map_deltas_map(copy()); + auto res = isl_space_multi_val(copy(), list.release()); return manage(res); } -isl::union_map union_map::detect_equalities() const +isl::aff space::param_aff_on_domain(isl::id id) const { - auto res = isl_union_map_detect_equalities(copy()); + auto res = isl_space_param_aff_on_domain_id(copy(), id.release()); return manage(res); } -isl_size union_map::dim(isl::dim type) const +isl::aff space::param_aff_on_domain(const std::string &id) const { - auto res = isl_union_map_dim(get(), static_cast(type)); - return res; + return this->param_aff_on_domain(isl::id(ctx(), id)); } -isl::union_set union_map::domain() const +isl::space space::params() const { - auto res = isl_union_map_domain(copy()); + auto res = isl_space_params(copy()); return manage(res); } -isl::union_map union_map::domain_factor_domain() const +isl::space space::params_alloc(isl::ctx ctx, unsigned int nparam) { - auto res = isl_union_map_domain_factor_domain(copy()); + auto res = 
isl_space_params_alloc(ctx.release(), nparam); return manage(res); } -isl::union_map union_map::domain_factor_range() const +isl::space space::product(isl::space right) const { - auto res = isl_union_map_domain_factor_range(copy()); + auto res = isl_space_product(copy(), right.release()); return manage(res); } -isl::union_map union_map::domain_map() const +isl::space space::range() const { - auto res = isl_union_map_domain_map(copy()); + auto res = isl_space_range(copy()); return manage(res); } -isl::union_pw_multi_aff union_map::domain_map_union_pw_multi_aff() const +isl::multi_aff space::range_map_multi_aff() const { - auto res = isl_union_map_domain_map_union_pw_multi_aff(copy()); + auto res = isl_space_range_map_multi_aff(copy()); return manage(res); } -isl::union_map union_map::domain_product(isl::union_map umap2) const +isl::pw_multi_aff space::range_map_pw_multi_aff() const { - auto res = isl_union_map_domain_product(copy(), umap2.release()); + auto res = isl_space_range_map_pw_multi_aff(copy()); return manage(res); } -isl::union_map union_map::empty(isl::ctx ctx) +isl::space space::range_reverse() const { - auto res = isl_union_map_empty_ctx(ctx.release()); + auto res = isl_space_range_reverse(copy()); return manage(res); } -isl::union_map union_map::eq_at(isl::multi_union_pw_aff mupa) const +isl::id space::range_tuple_id() const { - auto res = isl_union_map_eq_at_multi_union_pw_aff(copy(), mupa.release()); + auto res = isl_space_get_range_tuple_id(get()); return manage(res); } -isl::map union_map::extract_map(isl::space space) const +isl::id space::get_range_tuple_id() const { - auto res = isl_union_map_extract_map(get(), space.release()); - return manage(res); + return range_tuple_id(); } -isl::union_map union_map::factor_domain() const +isl::space space::reverse() const { - auto res = isl_union_map_factor_domain(copy()); + auto res = isl_space_reverse(copy()); return manage(res); } -isl::union_map union_map::factor_range() const +isl::space 
space::set_dim_id(isl::dim type, unsigned int pos, isl::id id) const { - auto res = isl_union_map_factor_range(copy()); + auto res = isl_space_set_dim_id(copy(), static_cast(type), pos, id.release()); return manage(res); } -int union_map::find_dim_by_name(isl::dim type, const std::string &name) const +isl::space space::set_dim_id(isl::dim type, unsigned int pos, const std::string &id) const { - auto res = isl_union_map_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return this->set_dim_id(type, pos, isl::id(ctx(), id)); } -isl::union_map union_map::fixed_power(isl::val exp) const +isl::space space::set_domain_tuple(isl::id id) const { - auto res = isl_union_map_fixed_power_val(copy(), exp.release()); + auto res = isl_space_set_domain_tuple_id(copy(), id.release()); return manage(res); } -isl::union_map union_map::flat_domain_product(isl::union_map umap2) const +isl::space space::set_domain_tuple(const std::string &id) const { - auto res = isl_union_map_flat_domain_product(copy(), umap2.release()); - return manage(res); + return this->set_domain_tuple(isl::id(ctx(), id)); } -isl::union_map union_map::flat_range_product(isl::union_map umap2) const +isl::space space::set_from_params() const { - auto res = isl_union_map_flat_range_product(copy(), umap2.release()); + auto res = isl_space_set_from_params(copy()); return manage(res); } -stat union_map::foreach_map(const std::function &fn) const +isl::space space::set_range_tuple(isl::id id) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_map *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_union_map_foreach_map(get(), fn_lambda, &fn_data); + auto res = isl_space_set_range_tuple_id(copy(), id.release()); return manage(res); } -isl::union_map union_map::from(isl::multi_union_pw_aff mupa) +isl::space space::set_range_tuple(const 
std::string &id) const { - auto res = isl_union_map_from_multi_union_pw_aff(mupa.release()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl::union_map union_map::from_domain(isl::union_set uset) +isl::space space::set_tuple_id(isl::dim type, isl::id id) const { - auto res = isl_union_map_from_domain(uset.release()); + auto res = isl_space_set_tuple_id(copy(), static_cast(type), id.release()); return manage(res); } -isl::union_map union_map::from_domain_and_range(isl::union_set domain, isl::union_set range) +isl::space space::set_tuple_id(isl::dim type, const std::string &id) const { - auto res = isl_union_map_from_domain_and_range(domain.release(), range.release()); - return manage(res); + return this->set_tuple_id(type, isl::id(ctx(), id)); } -isl::union_map union_map::from_range(isl::union_set uset) +isl::id space::tuple_id(isl::dim type) const { - auto res = isl_union_map_from_range(uset.release()); + auto res = isl_space_get_tuple_id(get(), static_cast(type)); return manage(res); } -isl::union_map union_map::from_union_pw_aff(isl::union_pw_aff upa) +isl::id space::get_tuple_id(isl::dim type) const { - auto res = isl_union_map_from_union_pw_aff(upa.release()); - return manage(res); + return tuple_id(type); } -isl::id union_map::get_dim_id(isl::dim type, unsigned int pos) const +std::string space::tuple_name(isl::dim type) const { - auto res = isl_union_map_get_dim_id(get(), static_cast(type), pos); - return manage(res); + auto res = isl_space_get_tuple_name(get(), static_cast(type)); + std::string tmp(res); + return tmp; } -uint32_t union_map::get_hash() const +std::string space::get_tuple_name(isl::dim type) const { - auto res = isl_union_map_get_hash(get()); - return res; + return tuple_name(type); } -isl::map_list union_map::get_map_list() const +isl::space space::uncurry() const { - auto res = isl_union_map_get_map_list(get()); + auto res = isl_space_uncurry(copy()); return manage(res); } -isl::space union_map::get_space() 
const +isl::space space::unit(isl::ctx ctx) { - auto res = isl_union_map_get_space(get()); + auto res = isl_space_unit(ctx.release()); return manage(res); } -isl::union_map union_map::gist(isl::union_map context) const +isl::map space::universe_map() const { - auto res = isl_union_map_gist(copy(), context.release()); + auto res = isl_space_universe_map(copy()); return manage(res); } -isl::union_map union_map::gist_domain(isl::union_set uset) const +isl::set space::universe_set() const { - auto res = isl_union_map_gist_domain(copy(), uset.release()); + auto res = isl_space_universe_set(copy()); return manage(res); } -isl::union_map union_map::gist_params(isl::set set) const +isl::space space::unwrap() const { - auto res = isl_union_map_gist_params(copy(), set.release()); + auto res = isl_space_unwrap(copy()); return manage(res); } -isl::union_map union_map::gist_range(isl::union_set uset) const +isl::space space::wrap() const { - auto res = isl_union_map_gist_range(copy(), uset.release()); + auto res = isl_space_wrap(copy()); return manage(res); } -isl::union_map union_map::intersect(isl::union_map umap2) const +isl::aff space::zero_aff_on_domain() const { - auto res = isl_union_map_intersect(copy(), umap2.release()); + auto res = isl_space_zero_aff_on_domain(copy()); return manage(res); } -isl::union_map union_map::intersect_domain(isl::space space) const +isl::multi_aff space::zero_multi_aff() const { - auto res = isl_union_map_intersect_domain_space(copy(), space.release()); + auto res = isl_space_zero_multi_aff(copy()); return manage(res); } -isl::union_map union_map::intersect_domain(isl::union_set uset) const +isl::multi_pw_aff space::zero_multi_pw_aff() const { - auto res = isl_union_map_intersect_domain_union_set(copy(), uset.release()); + auto res = isl_space_zero_multi_pw_aff(copy()); return manage(res); } -isl::union_map union_map::intersect_domain_factor_domain(isl::union_map factor) const +isl::multi_union_pw_aff space::zero_multi_union_pw_aff() const { 
- auto res = isl_union_map_intersect_domain_factor_domain(copy(), factor.release()); + auto res = isl_space_zero_multi_union_pw_aff(copy()); return manage(res); } -isl::union_map union_map::intersect_domain_factor_range(isl::union_map factor) const +isl::multi_val space::zero_multi_val() const { - auto res = isl_union_map_intersect_domain_factor_range(copy(), factor.release()); + auto res = isl_space_zero_multi_val(copy()); return manage(res); } -isl::union_map union_map::intersect_params(isl::set set) const +inline std::ostream &operator<<(std::ostream &os, const space &obj) { - auto res = isl_union_map_intersect_params(copy(), set.release()); - return manage(res); + char *str = isl_space_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::union_map union_map::intersect_range(isl::space space) const +// implementations for isl::union_access_info +union_access_info manage(__isl_take isl_union_access_info *ptr) { + return union_access_info(ptr); +} +union_access_info manage_copy(__isl_keep isl_union_access_info *ptr) { + ptr = isl_union_access_info_copy(ptr); + return union_access_info(ptr); +} + +union_access_info::union_access_info() + : ptr(nullptr) {} + +union_access_info::union_access_info(const union_access_info &obj) + : ptr(nullptr) { - auto res = isl_union_map_intersect_range_space(copy(), space.release()); - return manage(res); + ptr = obj.copy(); } -isl::union_map union_map::intersect_range(isl::union_set uset) const +union_access_info::union_access_info(__isl_take isl_union_access_info *ptr) + : ptr(ptr) {} + +union_access_info::union_access_info(isl::union_map sink) { - auto res = isl_union_map_intersect_range_union_set(copy(), uset.release()); - return manage(res); + auto res = isl_union_access_info_from_sink(sink.release()); + ptr = res; +} + +union_access_info &union_access_info::operator=(union_access_info obj) { + std::swap(this->ptr, obj.ptr); + return *this; +} + 
+union_access_info::~union_access_info() { + if (ptr) + isl_union_access_info_free(ptr); } -isl::union_map union_map::intersect_range_factor_domain(isl::union_map factor) const -{ - auto res = isl_union_map_intersect_range_factor_domain(copy(), factor.release()); - return manage(res); +__isl_give isl_union_access_info *union_access_info::copy() const & { + return isl_union_access_info_copy(ptr); } -isl::union_map union_map::intersect_range_factor_range(isl::union_map factor) const -{ - auto res = isl_union_map_intersect_range_factor_range(copy(), factor.release()); - return manage(res); +__isl_keep isl_union_access_info *union_access_info::get() const { + return ptr; } -boolean union_map::involves_dims(isl::dim type, unsigned int first, unsigned int n) const -{ - auto res = isl_union_map_involves_dims(get(), static_cast(type), first, n); - return manage(res); +__isl_give isl_union_access_info *union_access_info::release() { + isl_union_access_info *tmp = ptr; + ptr = nullptr; + return tmp; } -boolean union_map::is_bijective() const -{ - auto res = isl_union_map_is_bijective(get()); - return manage(res); +bool union_access_info::is_null() const { + return ptr == nullptr; } -boolean union_map::is_disjoint(const isl::union_map &umap2) const -{ - auto res = isl_union_map_is_disjoint(get(), umap2.get()); - return manage(res); +isl::ctx union_access_info::ctx() const { + return isl::ctx(isl_union_access_info_get_ctx(ptr)); } -boolean union_map::is_empty() const +isl::union_flow union_access_info::compute_flow() const { - auto res = isl_union_map_is_empty(get()); + auto res = isl_union_access_info_compute_flow(copy()); return manage(res); } -boolean union_map::is_equal(const isl::union_map &umap2) const +isl::union_access_info union_access_info::set_kill(isl::union_map kill) const { - auto res = isl_union_map_is_equal(get(), umap2.get()); + auto res = isl_union_access_info_set_kill(copy(), kill.release()); return manage(res); } -boolean union_map::is_identity() const 
+isl::union_access_info union_access_info::set_may_source(isl::union_map may_source) const { - auto res = isl_union_map_is_identity(get()); + auto res = isl_union_access_info_set_may_source(copy(), may_source.release()); return manage(res); } -boolean union_map::is_injective() const +isl::union_access_info union_access_info::set_must_source(isl::union_map must_source) const { - auto res = isl_union_map_is_injective(get()); + auto res = isl_union_access_info_set_must_source(copy(), must_source.release()); return manage(res); } -boolean union_map::is_single_valued() const +isl::union_access_info union_access_info::set_schedule(isl::schedule schedule) const { - auto res = isl_union_map_is_single_valued(get()); + auto res = isl_union_access_info_set_schedule(copy(), schedule.release()); return manage(res); } -boolean union_map::is_strict_subset(const isl::union_map &umap2) const +isl::union_access_info union_access_info::set_schedule_map(isl::union_map schedule_map) const { - auto res = isl_union_map_is_strict_subset(get(), umap2.get()); + auto res = isl_union_access_info_set_schedule_map(copy(), schedule_map.release()); return manage(res); } -boolean union_map::is_subset(const isl::union_map &umap2) const +inline std::ostream &operator<<(std::ostream &os, const union_access_info &obj) { - auto res = isl_union_map_is_subset(get(), umap2.get()); - return manage(res); + char *str = isl_union_access_info_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -boolean union_map::isa_map() const -{ - auto res = isl_union_map_isa_map(get()); - return manage(res); +// implementations for isl::union_flow +union_flow manage(__isl_take isl_union_flow *ptr) { + return union_flow(ptr); } - -isl::union_map union_map::lex_ge_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const -{ - auto res = isl_union_map_lex_ge_at_multi_union_pw_aff(copy(), mupa.release()); - return manage(res); +union_flow 
manage_copy(__isl_keep isl_union_flow *ptr) { + ptr = isl_union_flow_copy(ptr); + return union_flow(ptr); } -isl::union_map union_map::lex_ge_union_map(isl::union_map umap2) const +union_flow::union_flow() + : ptr(nullptr) {} + +union_flow::union_flow(const union_flow &obj) + : ptr(nullptr) { - auto res = isl_union_map_lex_ge_union_map(copy(), umap2.release()); - return manage(res); + ptr = obj.copy(); } -isl::union_map union_map::lex_gt_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const -{ - auto res = isl_union_map_lex_gt_at_multi_union_pw_aff(copy(), mupa.release()); - return manage(res); +union_flow::union_flow(__isl_take isl_union_flow *ptr) + : ptr(ptr) {} + +union_flow &union_flow::operator=(union_flow obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::union_map union_map::lex_gt_union_map(isl::union_map umap2) const -{ - auto res = isl_union_map_lex_gt_union_map(copy(), umap2.release()); - return manage(res); +union_flow::~union_flow() { + if (ptr) + isl_union_flow_free(ptr); } -isl::union_map union_map::lex_le_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const -{ - auto res = isl_union_map_lex_le_at_multi_union_pw_aff(copy(), mupa.release()); - return manage(res); +__isl_give isl_union_flow *union_flow::copy() const & { + return isl_union_flow_copy(ptr); } -isl::union_map union_map::lex_le_union_map(isl::union_map umap2) const -{ - auto res = isl_union_map_lex_le_union_map(copy(), umap2.release()); - return manage(res); +__isl_keep isl_union_flow *union_flow::get() const { + return ptr; } -isl::union_map union_map::lex_lt_at_multi_union_pw_aff(isl::multi_union_pw_aff mupa) const -{ - auto res = isl_union_map_lex_lt_at_multi_union_pw_aff(copy(), mupa.release()); - return manage(res); +__isl_give isl_union_flow *union_flow::release() { + isl_union_flow *tmp = ptr; + ptr = nullptr; + return tmp; } -isl::union_map union_map::lex_lt_union_map(isl::union_map umap2) const -{ - auto res = isl_union_map_lex_lt_union_map(copy(), 
umap2.release()); - return manage(res); +bool union_flow::is_null() const { + return ptr == nullptr; } -isl::union_map union_map::lexmax() const -{ - auto res = isl_union_map_lexmax(copy()); - return manage(res); +isl::ctx union_flow::ctx() const { + return isl::ctx(isl_union_flow_get_ctx(ptr)); } -isl::union_map union_map::lexmin() const +isl::union_map union_flow::full_may_dependence() const { - auto res = isl_union_map_lexmin(copy()); + auto res = isl_union_flow_get_full_may_dependence(get()); return manage(res); } -isl_size union_map::n_map() const +isl::union_map union_flow::get_full_may_dependence() const { - auto res = isl_union_map_n_map(get()); - return res; + return full_may_dependence(); } -isl::set union_map::params() const +isl::union_map union_flow::full_must_dependence() const { - auto res = isl_union_map_params(copy()); + auto res = isl_union_flow_get_full_must_dependence(get()); return manage(res); } -boolean union_map::plain_is_empty() const +isl::union_map union_flow::get_full_must_dependence() const { - auto res = isl_union_map_plain_is_empty(get()); - return manage(res); + return full_must_dependence(); } -boolean union_map::plain_is_injective() const +isl::union_map union_flow::may_dependence() const { - auto res = isl_union_map_plain_is_injective(get()); + auto res = isl_union_flow_get_may_dependence(get()); return manage(res); } -isl::union_map union_map::polyhedral_hull() const +isl::union_map union_flow::get_may_dependence() const { - auto res = isl_union_map_polyhedral_hull(copy()); - return manage(res); + return may_dependence(); } -isl::union_map union_map::preimage_domain(isl::multi_aff ma) const +isl::union_map union_flow::may_no_source() const { - auto res = isl_union_map_preimage_domain_multi_aff(copy(), ma.release()); + auto res = isl_union_flow_get_may_no_source(get()); return manage(res); } -isl::union_map union_map::preimage_domain(isl::multi_pw_aff mpa) const +isl::union_map union_flow::get_may_no_source() const { - auto res = 
isl_union_map_preimage_domain_multi_pw_aff(copy(), mpa.release()); - return manage(res); + return may_no_source(); } -isl::union_map union_map::preimage_domain(isl::pw_multi_aff pma) const +isl::union_map union_flow::must_dependence() const { - auto res = isl_union_map_preimage_domain_pw_multi_aff(copy(), pma.release()); + auto res = isl_union_flow_get_must_dependence(get()); return manage(res); } -isl::union_map union_map::preimage_domain(isl::union_pw_multi_aff upma) const +isl::union_map union_flow::get_must_dependence() const { - auto res = isl_union_map_preimage_domain_union_pw_multi_aff(copy(), upma.release()); - return manage(res); + return must_dependence(); } -isl::union_map union_map::preimage_range(isl::multi_aff ma) const +isl::union_map union_flow::must_no_source() const { - auto res = isl_union_map_preimage_range_multi_aff(copy(), ma.release()); + auto res = isl_union_flow_get_must_no_source(get()); return manage(res); } -isl::union_map union_map::preimage_range(isl::pw_multi_aff pma) const +isl::union_map union_flow::get_must_no_source() const { - auto res = isl_union_map_preimage_range_pw_multi_aff(copy(), pma.release()); - return manage(res); + return must_no_source(); } -isl::union_map union_map::preimage_range(isl::union_pw_multi_aff upma) const +inline std::ostream &operator<<(std::ostream &os, const union_flow &obj) { - auto res = isl_union_map_preimage_range_union_pw_multi_aff(copy(), upma.release()); - return manage(res); + char *str = isl_union_flow_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::union_map union_map::product(isl::union_map umap2) const -{ - auto res = isl_union_map_product(copy(), umap2.release()); - return manage(res); +// implementations for isl::union_map +union_map manage(__isl_take isl_union_map *ptr) { + return union_map(ptr); +} +union_map manage_copy(__isl_keep isl_union_map *ptr) { + ptr = isl_union_map_copy(ptr); + return 
union_map(ptr); } -isl::union_map union_map::project_out(isl::dim type, unsigned int first, unsigned int n) const +union_map::union_map() + : ptr(nullptr) {} + +union_map::union_map(const union_map &obj) + : ptr(nullptr) { - auto res = isl_union_map_project_out(copy(), static_cast(type), first, n); - return manage(res); + ptr = obj.copy(); } -isl::union_map union_map::project_out_all_params() const +union_map::union_map(__isl_take isl_union_map *ptr) + : ptr(ptr) {} + +union_map::union_map(isl::basic_map bmap) { - auto res = isl_union_map_project_out_all_params(copy()); - return manage(res); + auto res = isl_union_map_from_basic_map(bmap.release()); + ptr = res; } -isl::union_set union_map::range() const +union_map::union_map(isl::map map) { - auto res = isl_union_map_range(copy()); - return manage(res); + auto res = isl_union_map_from_map(map.release()); + ptr = res; } -isl::union_map union_map::range_curry() const +union_map::union_map(isl::ctx ctx, const std::string &str) { - auto res = isl_union_map_range_curry(copy()); - return manage(res); + auto res = isl_union_map_read_from_str(ctx.release(), str.c_str()); + ptr = res; } -isl::union_map union_map::range_factor_domain() const -{ - auto res = isl_union_map_range_factor_domain(copy()); - return manage(res); +union_map &union_map::operator=(union_map obj) { + std::swap(this->ptr, obj.ptr); + return *this; } -isl::union_map union_map::range_factor_range() const -{ - auto res = isl_union_map_range_factor_range(copy()); - return manage(res); +union_map::~union_map() { + if (ptr) + isl_union_map_free(ptr); } -isl::union_map union_map::range_map() const -{ - auto res = isl_union_map_range_map(copy()); - return manage(res); +__isl_give isl_union_map *union_map::copy() const & { + return isl_union_map_copy(ptr); } -isl::union_map union_map::range_product(isl::union_map umap2) const -{ - auto res = isl_union_map_range_product(copy(), umap2.release()); - return manage(res); +__isl_keep isl_union_map *union_map::get() 
const { + return ptr; } -isl::union_map union_map::range_reverse() const -{ - auto res = isl_union_map_range_reverse(copy()); - return manage(res); +__isl_give isl_union_map *union_map::release() { + isl_union_map *tmp = ptr; + ptr = nullptr; + return tmp; } -isl::union_map union_map::remove_divs() const -{ - auto res = isl_union_map_remove_divs(copy()); - return manage(res); +bool union_map::is_null() const { + return ptr == nullptr; } -isl::union_map union_map::remove_redundancies() const -{ - auto res = isl_union_map_remove_redundancies(copy()); - return manage(res); +isl::ctx union_map::ctx() const { + return isl::ctx(isl_union_map_get_ctx(ptr)); } -isl::union_map union_map::reset_user() const +isl::union_map union_map::affine_hull() const { - auto res = isl_union_map_reset_user(copy()); + auto res = isl_union_map_affine_hull(copy()); return manage(res); } -isl::union_map union_map::reverse() const +isl::union_map union_map::apply_domain(isl::union_map umap2) const { - auto res = isl_union_map_reverse(copy()); + auto res = isl_union_map_apply_domain(copy(), umap2.release()); return manage(res); } -isl::basic_map union_map::sample() const +isl::union_map union_map::apply_range(isl::union_map umap2) const { - auto res = isl_union_map_sample(copy()); + auto res = isl_union_map_apply_range(copy(), umap2.release()); return manage(res); } -isl::union_map union_map::simple_hull() const +isl::map union_map::as_map() const { - auto res = isl_union_map_simple_hull(copy()); + auto res = isl_union_map_as_map(copy()); return manage(res); } -isl::union_map union_map::subtract(isl::union_map umap2) const +isl::multi_union_pw_aff union_map::as_multi_union_pw_aff() const { - auto res = isl_union_map_subtract(copy(), umap2.release()); + auto res = isl_union_map_as_multi_union_pw_aff(copy()); return manage(res); } -isl::union_map union_map::subtract_domain(isl::union_set dom) const +isl::union_pw_multi_aff union_map::as_union_pw_multi_aff() const { - auto res = 
isl_union_map_subtract_domain(copy(), dom.release()); + auto res = isl_union_map_as_union_pw_multi_aff(copy()); return manage(res); } -isl::union_map union_map::subtract_range(isl::union_set dom) const +isl::union_set union_map::bind_range(isl::multi_id tuple) const { - auto res = isl_union_map_subtract_range(copy(), dom.release()); + auto res = isl_union_map_bind_range(copy(), tuple.release()); return manage(res); } -isl::union_map union_map::uncurry() const +isl::union_map union_map::coalesce() const { - auto res = isl_union_map_uncurry(copy()); + auto res = isl_union_map_coalesce(copy()); return manage(res); } -isl::union_map union_map::unite(isl::union_map umap2) const +isl::union_map union_map::compute_divs() const { - auto res = isl_union_map_union(copy(), umap2.release()); + auto res = isl_union_map_compute_divs(copy()); return manage(res); } -isl::union_map union_map::universe() const +isl::union_map union_map::curry() const { - auto res = isl_union_map_universe(copy()); + auto res = isl_union_map_curry(copy()); return manage(res); } -isl::union_set union_map::wrap() const +isl::union_set union_map::deltas() const { - auto res = isl_union_map_wrap(copy()); + auto res = isl_union_map_deltas(copy()); return manage(res); } -isl::union_map union_map::zip() const +isl::union_map union_map::detect_equalities() const { - auto res = isl_union_map_zip(copy()); + auto res = isl_union_map_detect_equalities(copy()); return manage(res); } -// implementations for isl::union_map_list -union_map_list manage(__isl_take isl_union_map_list *ptr) { - return union_map_list(ptr); -} -union_map_list manage_copy(__isl_keep isl_union_map_list *ptr) { - ptr = isl_union_map_list_copy(ptr); - return union_map_list(ptr); +isl::union_set union_map::domain() const +{ + auto res = isl_union_map_domain(copy()); + return manage(res); } -union_map_list::union_map_list() - : ptr(nullptr) {} - -union_map_list::union_map_list(const union_map_list &obj) - : ptr(nullptr) +isl::union_map 
union_map::domain_factor_domain() const { - ptr = obj.copy(); + auto res = isl_union_map_domain_factor_domain(copy()); + return manage(res); } - -union_map_list::union_map_list(__isl_take isl_union_map_list *ptr) - : ptr(ptr) {} - - -union_map_list &union_map_list::operator=(union_map_list obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::union_map union_map::domain_factor_range() const +{ + auto res = isl_union_map_domain_factor_range(copy()); + return manage(res); } -union_map_list::~union_map_list() { - if (ptr) - isl_union_map_list_free(ptr); +isl::union_map union_map::domain_map() const +{ + auto res = isl_union_map_domain_map(copy()); + return manage(res); } -__isl_give isl_union_map_list *union_map_list::copy() const & { - return isl_union_map_list_copy(ptr); +isl::union_pw_multi_aff union_map::domain_map_union_pw_multi_aff() const +{ + auto res = isl_union_map_domain_map_union_pw_multi_aff(copy()); + return manage(res); } -__isl_keep isl_union_map_list *union_map_list::get() const { - return ptr; +isl::union_map union_map::domain_product(isl::union_map umap2) const +{ + auto res = isl_union_map_domain_product(copy(), umap2.release()); + return manage(res); } -__isl_give isl_union_map_list *union_map_list::release() { - isl_union_map_list *tmp = ptr; - ptr = nullptr; - return tmp; +isl::union_map union_map::empty(isl::ctx ctx) +{ + auto res = isl_union_map_empty_ctx(ctx.release()); + return manage(res); } -bool union_map_list::is_null() const { - return ptr == nullptr; +isl::union_map union_map::eq_at(isl::multi_union_pw_aff mupa) const +{ + auto res = isl_union_map_eq_at_multi_union_pw_aff(copy(), mupa.release()); + return manage(res); } - -isl::ctx union_map_list::ctx() const { - return isl::ctx(isl_union_map_list_get_ctx(ptr)); +boolean union_map::every_map(const std::function &test) const +{ + struct test_data { + std::function func; + } test_data = { test }; + auto test_lambda = [](isl_map *arg_0, void *arg_1) -> isl_bool { + auto *data = 
static_cast(arg_1); + auto ret = (data->func)(manage_copy(arg_0)); + return ret.release(); + }; + auto res = isl_union_map_every_map(get(), test_lambda, &test_data); + return manage(res); } -void union_map_list::dump() const { - isl_union_map_list_dump(get()); +isl::map union_map::extract_map(isl::space space) const +{ + auto res = isl_union_map_extract_map(get(), space.release()); + return manage(res); } - -isl::union_map_list union_map_list::add(isl::union_map el) const +isl::union_map union_map::factor_domain() const { - auto res = isl_union_map_list_add(copy(), el.release()); + auto res = isl_union_map_factor_domain(copy()); return manage(res); } -isl::union_map_list union_map_list::alloc(isl::ctx ctx, int n) +isl::union_map union_map::factor_range() const { - auto res = isl_union_map_list_alloc(ctx.release(), n); + auto res = isl_union_map_factor_range(copy()); return manage(res); } -isl::union_map_list union_map_list::clear() const +isl::union_map union_map::fixed_power(isl::val exp) const { - auto res = isl_union_map_list_clear(copy()); + auto res = isl_union_map_fixed_power_val(copy(), exp.release()); return manage(res); } -isl::union_map_list union_map_list::concat(isl::union_map_list list2) const +isl::union_map union_map::fixed_power(long exp) const { - auto res = isl_union_map_list_concat(copy(), list2.release()); - return manage(res); + return this->fixed_power(isl::val(ctx(), exp)); } -isl::union_map_list union_map_list::drop(unsigned int first, unsigned int n) const +isl::union_map union_map::flat_range_product(isl::union_map umap2) const { - auto res = isl_union_map_list_drop(copy(), first, n); + auto res = isl_union_map_flat_range_product(copy(), umap2.release()); return manage(res); } -stat union_map_list::foreach(const std::function &fn) const +stat union_map::foreach_map(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_union_map *arg_0, void *arg_1) -> isl_stat { + 
std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_map *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; - auto res = isl_union_map_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_union_map_foreach_map(get(), fn_lambda, &fn_data); return manage(res); } -isl::union_map_list union_map_list::from_union_map(isl::union_map el) +isl::union_map union_map::from(isl::multi_union_pw_aff mupa) { - auto res = isl_union_map_list_from_union_map(el.release()); + auto res = isl_union_map_from_multi_union_pw_aff(mupa.release()); return manage(res); } -isl::union_map union_map_list::get_at(int index) const +isl::union_map union_map::from(isl::union_pw_multi_aff upma) { - auto res = isl_union_map_list_get_at(get(), index); + auto res = isl_union_map_from_union_pw_multi_aff(upma.release()); return manage(res); } -isl::union_map union_map_list::get_union_map(int index) const +isl::union_map union_map::from_domain(isl::union_set uset) { - auto res = isl_union_map_list_get_union_map(get(), index); + auto res = isl_union_map_from_domain(uset.release()); return manage(res); } -isl::union_map_list union_map_list::insert(unsigned int pos, isl::union_map el) const +isl::union_map union_map::from_domain_and_range(isl::union_set domain, isl::union_set range) { - auto res = isl_union_map_list_insert(copy(), pos, el.release()); + auto res = isl_union_map_from_domain_and_range(domain.release(), range.release()); return manage(res); } -isl_size union_map_list::n_union_map() const +isl::union_map union_map::from_range(isl::union_set uset) { - auto res = isl_union_map_list_n_union_map(get()); - return res; + auto res = isl_union_map_from_range(uset.release()); + return manage(res); } -isl::union_map_list union_map_list::reverse() const +isl::union_map union_map::gist(isl::union_map context) const { - auto res = isl_union_map_list_reverse(copy()); 
+ auto res = isl_union_map_gist(copy(), context.release()); return manage(res); } -isl::union_map_list union_map_list::set_union_map(int index, isl::union_map el) const +isl::union_map union_map::gist_domain(isl::union_set uset) const { - auto res = isl_union_map_list_set_union_map(copy(), index, el.release()); + auto res = isl_union_map_gist_domain(copy(), uset.release()); return manage(res); } -isl_size union_map_list::size() const +isl::union_map union_map::gist_params(isl::set set) const { - auto res = isl_union_map_list_size(get()); - return res; + auto res = isl_union_map_gist_params(copy(), set.release()); + return manage(res); } -isl::union_map_list union_map_list::swap(unsigned int pos1, unsigned int pos2) const +isl::union_map union_map::gist_range(isl::union_set uset) const { - auto res = isl_union_map_list_swap(copy(), pos1, pos2); + auto res = isl_union_map_gist_range(copy(), uset.release()); return manage(res); } -// implementations for isl::union_pw_aff -union_pw_aff manage(__isl_take isl_union_pw_aff *ptr) { - return union_pw_aff(ptr); -} -union_pw_aff manage_copy(__isl_keep isl_union_pw_aff *ptr) { - ptr = isl_union_pw_aff_copy(ptr); - return union_pw_aff(ptr); -} - -union_pw_aff::union_pw_aff() - : ptr(nullptr) {} - -union_pw_aff::union_pw_aff(const union_pw_aff &obj) - : ptr(nullptr) +isl::union_map union_map::intersect(isl::union_map umap2) const { - ptr = obj.copy(); + auto res = isl_union_map_intersect(copy(), umap2.release()); + return manage(res); } - -union_pw_aff::union_pw_aff(__isl_take isl_union_pw_aff *ptr) - : ptr(ptr) {} - -union_pw_aff::union_pw_aff(isl::aff aff) -{ - auto res = isl_union_pw_aff_from_aff(aff.release()); - ptr = res; -} -union_pw_aff::union_pw_aff(isl::pw_aff pa) -{ - auto res = isl_union_pw_aff_from_pw_aff(pa.release()); - ptr = res; -} -union_pw_aff::union_pw_aff(isl::ctx ctx, const std::string &str) -{ - auto res = isl_union_pw_aff_read_from_str(ctx.release(), str.c_str()); - ptr = res; -} 
-union_pw_aff::union_pw_aff(isl::union_set domain, isl::val v) +isl::union_map union_map::intersect_domain(isl::space space) const { - auto res = isl_union_pw_aff_val_on_domain(domain.release(), v.release()); - ptr = res; -} - -union_pw_aff &union_pw_aff::operator=(union_pw_aff obj) { - std::swap(this->ptr, obj.ptr); - return *this; + auto res = isl_union_map_intersect_domain_space(copy(), space.release()); + return manage(res); } -union_pw_aff::~union_pw_aff() { - if (ptr) - isl_union_pw_aff_free(ptr); +isl::union_map union_map::intersect_domain(isl::union_set uset) const +{ + auto res = isl_union_map_intersect_domain_union_set(copy(), uset.release()); + return manage(res); } -__isl_give isl_union_pw_aff *union_pw_aff::copy() const & { - return isl_union_pw_aff_copy(ptr); +isl::union_map union_map::intersect_domain_factor_domain(isl::union_map factor) const +{ + auto res = isl_union_map_intersect_domain_factor_domain(copy(), factor.release()); + return manage(res); } -__isl_keep isl_union_pw_aff *union_pw_aff::get() const { - return ptr; +isl::union_map union_map::intersect_domain_factor_range(isl::union_map factor) const +{ + auto res = isl_union_map_intersect_domain_factor_range(copy(), factor.release()); + return manage(res); } -__isl_give isl_union_pw_aff *union_pw_aff::release() { - isl_union_pw_aff *tmp = ptr; - ptr = nullptr; - return tmp; +isl::union_map union_map::intersect_params(isl::set set) const +{ + auto res = isl_union_map_intersect_params(copy(), set.release()); + return manage(res); } -bool union_pw_aff::is_null() const { - return ptr == nullptr; +isl::union_map union_map::intersect_range(isl::space space) const +{ + auto res = isl_union_map_intersect_range_space(copy(), space.release()); + return manage(res); } - -isl::ctx union_pw_aff::ctx() const { - return isl::ctx(isl_union_pw_aff_get_ctx(ptr)); +isl::union_map union_map::intersect_range(isl::union_set uset) const +{ + auto res = isl_union_map_intersect_range_union_set(copy(), 
uset.release()); + return manage(res); } -void union_pw_aff::dump() const { - isl_union_pw_aff_dump(get()); +isl::union_map union_map::intersect_range_factor_domain(isl::union_map factor) const +{ + auto res = isl_union_map_intersect_range_factor_domain(copy(), factor.release()); + return manage(res); } - -isl::union_pw_aff union_pw_aff::add(isl::union_pw_aff upa2) const +isl::union_map union_map::intersect_range_factor_range(isl::union_map factor) const { - auto res = isl_union_pw_aff_add(copy(), upa2.release()); + auto res = isl_union_map_intersect_range_factor_range(copy(), factor.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::add_pw_aff(isl::pw_aff pa) const +boolean union_map::is_bijective() const { - auto res = isl_union_pw_aff_add_pw_aff(copy(), pa.release()); + auto res = isl_union_map_is_bijective(get()); return manage(res); } -isl::union_pw_aff union_pw_aff::aff_on_domain(isl::union_set domain, isl::aff aff) +boolean union_map::is_disjoint(const isl::union_map &umap2) const { - auto res = isl_union_pw_aff_aff_on_domain(domain.release(), aff.release()); + auto res = isl_union_map_is_disjoint(get(), umap2.get()); return manage(res); } -isl::union_pw_aff union_pw_aff::align_params(isl::space model) const +boolean union_map::is_empty() const { - auto res = isl_union_pw_aff_align_params(copy(), model.release()); + auto res = isl_union_map_is_empty(get()); return manage(res); } -isl::union_set union_pw_aff::bind(isl::id id) const +boolean union_map::is_equal(const isl::union_map &umap2) const { - auto res = isl_union_pw_aff_bind_id(copy(), id.release()); + auto res = isl_union_map_is_equal(get(), umap2.get()); return manage(res); } -isl::union_pw_aff union_pw_aff::coalesce() const +boolean union_map::is_injective() const { - auto res = isl_union_pw_aff_coalesce(copy()); + auto res = isl_union_map_is_injective(get()); return manage(res); } -isl_size union_pw_aff::dim(isl::dim type) const +boolean union_map::is_single_valued() const { - auto 
res = isl_union_pw_aff_dim(get(), static_cast(type)); - return res; + auto res = isl_union_map_is_single_valued(get()); + return manage(res); } -isl::union_set union_pw_aff::domain() const +boolean union_map::is_strict_subset(const isl::union_map &umap2) const { - auto res = isl_union_pw_aff_domain(copy()); + auto res = isl_union_map_is_strict_subset(get(), umap2.get()); return manage(res); } -isl::union_pw_aff union_pw_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +boolean union_map::is_subset(const isl::union_map &umap2) const { - auto res = isl_union_pw_aff_drop_dims(copy(), static_cast(type), first, n); + auto res = isl_union_map_is_subset(get(), umap2.get()); return manage(res); } -isl::union_pw_aff union_pw_aff::empty(isl::space space) +boolean union_map::isa_map() const { - auto res = isl_union_pw_aff_empty(space.release()); + auto res = isl_union_map_isa_map(get()); return manage(res); } -isl::union_pw_aff union_pw_aff::empty_ctx(isl::ctx ctx) +isl::union_map union_map::lexmax() const { - auto res = isl_union_pw_aff_empty_ctx(ctx.release()); + auto res = isl_union_map_lexmax(copy()); return manage(res); } -isl::union_pw_aff union_pw_aff::empty_space(isl::space space) +isl::union_map union_map::lexmin() const { - auto res = isl_union_pw_aff_empty_space(space.release()); + auto res = isl_union_map_lexmin(copy()); return manage(res); } -isl::pw_aff union_pw_aff::extract_pw_aff(isl::space space) const +isl::map_list union_map::map_list() const { - auto res = isl_union_pw_aff_extract_pw_aff(get(), space.release()); + auto res = isl_union_map_get_map_list(get()); return manage(res); } -int union_pw_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::map_list union_map::get_map_list() const { - auto res = isl_union_pw_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; + return map_list(); } -isl::union_pw_aff union_pw_aff::floor() const +isl::set union_map::params() const { - auto res = 
isl_union_pw_aff_floor(copy()); + auto res = isl_union_map_params(copy()); return manage(res); } -stat union_pw_aff::foreach_pw_aff(const std::function &fn) const +isl::union_map union_map::polyhedral_hull() const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_pw_aff *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_union_pw_aff_foreach_pw_aff(get(), fn_lambda, &fn_data); + auto res = isl_union_map_polyhedral_hull(copy()); return manage(res); } -isl::pw_aff_list union_pw_aff::get_pw_aff_list() const +isl::union_map union_map::preimage_domain(isl::multi_aff ma) const { - auto res = isl_union_pw_aff_get_pw_aff_list(get()); + auto res = isl_union_map_preimage_domain_multi_aff(copy(), ma.release()); return manage(res); } -isl::space union_pw_aff::get_space() const +isl::union_map union_map::preimage_domain(isl::multi_pw_aff mpa) const { - auto res = isl_union_pw_aff_get_space(get()); + auto res = isl_union_map_preimage_domain_multi_pw_aff(copy(), mpa.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::gist(isl::union_set context) const +isl::union_map union_map::preimage_domain(isl::pw_multi_aff pma) const { - auto res = isl_union_pw_aff_gist(copy(), context.release()); + auto res = isl_union_map_preimage_domain_pw_multi_aff(copy(), pma.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::gist_params(isl::set context) const +isl::union_map union_map::preimage_domain(isl::union_pw_multi_aff upma) const { - auto res = isl_union_pw_aff_gist_params(copy(), context.release()); + auto res = isl_union_map_preimage_domain_union_pw_multi_aff(copy(), upma.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::intersect_domain(isl::space space) const +isl::union_map union_map::preimage_range(isl::multi_aff ma) const { - auto res = isl_union_pw_aff_intersect_domain_space(copy(), 
space.release()); + auto res = isl_union_map_preimage_range_multi_aff(copy(), ma.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::intersect_domain(isl::union_set uset) const +isl::union_map union_map::preimage_range(isl::pw_multi_aff pma) const { - auto res = isl_union_pw_aff_intersect_domain_union_set(copy(), uset.release()); + auto res = isl_union_map_preimage_range_pw_multi_aff(copy(), pma.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::intersect_domain_wrapped_domain(isl::union_set uset) const +isl::union_map union_map::preimage_range(isl::union_pw_multi_aff upma) const { - auto res = isl_union_pw_aff_intersect_domain_wrapped_domain(copy(), uset.release()); + auto res = isl_union_map_preimage_range_union_pw_multi_aff(copy(), upma.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::intersect_domain_wrapped_range(isl::union_set uset) const +isl::union_map union_map::product(isl::union_map umap2) const { - auto res = isl_union_pw_aff_intersect_domain_wrapped_range(copy(), uset.release()); + auto res = isl_union_map_product(copy(), umap2.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::intersect_params(isl::set set) const +isl::union_map union_map::project_out_all_params() const { - auto res = isl_union_pw_aff_intersect_params(copy(), set.release()); + auto res = isl_union_map_project_out_all_params(copy()); return manage(res); } -boolean union_pw_aff::involves_nan() const +isl::union_set union_map::range() const { - auto res = isl_union_pw_aff_involves_nan(get()); + auto res = isl_union_map_range(copy()); return manage(res); } -isl::val union_pw_aff::max_val() const +isl::union_map union_map::range_factor_domain() const { - auto res = isl_union_pw_aff_max_val(copy()); + auto res = isl_union_map_range_factor_domain(copy()); return manage(res); } -isl::val union_pw_aff::min_val() const +isl::union_map union_map::range_factor_range() const { - auto res = isl_union_pw_aff_min_val(copy()); + auto res 
= isl_union_map_range_factor_range(copy()); return manage(res); } -isl::union_pw_aff union_pw_aff::mod_val(isl::val f) const +isl::union_map union_map::range_map() const { - auto res = isl_union_pw_aff_mod_val(copy(), f.release()); + auto res = isl_union_map_range_map(copy()); return manage(res); } -isl_size union_pw_aff::n_pw_aff() const +isl::union_map union_map::range_product(isl::union_map umap2) const { - auto res = isl_union_pw_aff_n_pw_aff(get()); - return res; + auto res = isl_union_map_range_product(copy(), umap2.release()); + return manage(res); } -isl::union_pw_aff union_pw_aff::neg() const +isl::union_map union_map::range_reverse() const { - auto res = isl_union_pw_aff_neg(copy()); + auto res = isl_union_map_range_reverse(copy()); return manage(res); } -isl::union_pw_aff union_pw_aff::param_on_domain_id(isl::union_set domain, isl::id id) +isl::union_map union_map::reverse() const { - auto res = isl_union_pw_aff_param_on_domain_id(domain.release(), id.release()); + auto res = isl_union_map_reverse(copy()); return manage(res); } -boolean union_pw_aff::plain_is_equal(const isl::union_pw_aff &upa2) const +isl::space union_map::space() const { - auto res = isl_union_pw_aff_plain_is_equal(get(), upa2.get()); + auto res = isl_union_map_get_space(get()); return manage(res); } -isl::union_pw_aff union_pw_aff::pullback(isl::union_pw_multi_aff upma) const +isl::space union_map::get_space() const { - auto res = isl_union_pw_aff_pullback_union_pw_multi_aff(copy(), upma.release()); - return manage(res); + return space(); } -isl::union_pw_aff union_pw_aff::pw_aff_on_domain(isl::union_set domain, isl::pw_aff pa) +isl::union_map union_map::subtract(isl::union_map umap2) const { - auto res = isl_union_pw_aff_pw_aff_on_domain(domain.release(), pa.release()); + auto res = isl_union_map_subtract(copy(), umap2.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::reset_user() const +isl::union_map union_map::subtract_domain(isl::union_set dom) const { - auto res 
= isl_union_pw_aff_reset_user(copy()); + auto res = isl_union_map_subtract_domain(copy(), dom.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::scale_down_val(isl::val v) const +isl::union_map union_map::subtract_range(isl::union_set dom) const { - auto res = isl_union_pw_aff_scale_down_val(copy(), v.release()); + auto res = isl_union_map_subtract_range(copy(), dom.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::scale_val(isl::val v) const +isl::union_map union_map::uncurry() const { - auto res = isl_union_pw_aff_scale_val(copy(), v.release()); + auto res = isl_union_map_uncurry(copy()); return manage(res); } -isl::union_pw_aff union_pw_aff::sub(isl::union_pw_aff upa2) const +isl::union_map union_map::unite(isl::union_map umap2) const { - auto res = isl_union_pw_aff_sub(copy(), upa2.release()); + auto res = isl_union_map_union(copy(), umap2.release()); return manage(res); } -isl::union_pw_aff union_pw_aff::subtract_domain(isl::space space) const +isl::union_map union_map::universe() const { - auto res = isl_union_pw_aff_subtract_domain_space(copy(), space.release()); + auto res = isl_union_map_universe(copy()); return manage(res); } -isl::union_pw_aff union_pw_aff::subtract_domain(isl::union_set uset) const +isl::union_set union_map::wrap() const { - auto res = isl_union_pw_aff_subtract_domain_union_set(copy(), uset.release()); + auto res = isl_union_map_wrap(copy()); return manage(res); } -isl::union_pw_aff union_pw_aff::union_add(isl::union_pw_aff upa2) const +isl::union_map union_map::zip() const { - auto res = isl_union_pw_aff_union_add(copy(), upa2.release()); + auto res = isl_union_map_zip(copy()); return manage(res); } -isl::union_set union_pw_aff::zero_union_set() const +inline std::ostream &operator<<(std::ostream &os, const union_map &obj) { - auto res = isl_union_pw_aff_zero_union_set(copy()); - return manage(res); + char *str = isl_union_map_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + 
return os; + } + os << str; + free(str); + return os; } -// implementations for isl::union_pw_aff_list -union_pw_aff_list manage(__isl_take isl_union_pw_aff_list *ptr) { - return union_pw_aff_list(ptr); +// implementations for isl::union_pw_aff +union_pw_aff manage(__isl_take isl_union_pw_aff *ptr) { + return union_pw_aff(ptr); } -union_pw_aff_list manage_copy(__isl_keep isl_union_pw_aff_list *ptr) { - ptr = isl_union_pw_aff_list_copy(ptr); - return union_pw_aff_list(ptr); +union_pw_aff manage_copy(__isl_keep isl_union_pw_aff *ptr) { + ptr = isl_union_pw_aff_copy(ptr); + return union_pw_aff(ptr); } -union_pw_aff_list::union_pw_aff_list() +union_pw_aff::union_pw_aff() : ptr(nullptr) {} -union_pw_aff_list::union_pw_aff_list(const union_pw_aff_list &obj) +union_pw_aff::union_pw_aff(const union_pw_aff &obj) : ptr(nullptr) { ptr = obj.copy(); } - -union_pw_aff_list::union_pw_aff_list(__isl_take isl_union_pw_aff_list *ptr) +union_pw_aff::union_pw_aff(__isl_take isl_union_pw_aff *ptr) : ptr(ptr) {} +union_pw_aff::union_pw_aff(isl::aff aff) +{ + auto res = isl_union_pw_aff_from_aff(aff.release()); + ptr = res; +} -union_pw_aff_list &union_pw_aff_list::operator=(union_pw_aff_list obj) { +union_pw_aff::union_pw_aff(isl::pw_aff pa) +{ + auto res = isl_union_pw_aff_from_pw_aff(pa.release()); + ptr = res; +} + +union_pw_aff::union_pw_aff(isl::ctx ctx, const std::string &str) +{ + auto res = isl_union_pw_aff_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} + +union_pw_aff::union_pw_aff(isl::union_set domain, isl::val v) +{ + auto res = isl_union_pw_aff_val_on_domain(domain.release(), v.release()); + ptr = res; +} + +union_pw_aff &union_pw_aff::operator=(union_pw_aff obj) { std::swap(this->ptr, obj.ptr); return *this; } -union_pw_aff_list::~union_pw_aff_list() { +union_pw_aff::~union_pw_aff() { if (ptr) - isl_union_pw_aff_list_free(ptr); + isl_union_pw_aff_free(ptr); } -__isl_give isl_union_pw_aff_list *union_pw_aff_list::copy() const & { - return 
isl_union_pw_aff_list_copy(ptr); +__isl_give isl_union_pw_aff *union_pw_aff::copy() const & { + return isl_union_pw_aff_copy(ptr); } -__isl_keep isl_union_pw_aff_list *union_pw_aff_list::get() const { +__isl_keep isl_union_pw_aff *union_pw_aff::get() const { return ptr; } -__isl_give isl_union_pw_aff_list *union_pw_aff_list::release() { - isl_union_pw_aff_list *tmp = ptr; +__isl_give isl_union_pw_aff *union_pw_aff::release() { + isl_union_pw_aff *tmp = ptr; ptr = nullptr; return tmp; } -bool union_pw_aff_list::is_null() const { +bool union_pw_aff::is_null() const { return ptr == nullptr; } - -isl::ctx union_pw_aff_list::ctx() const { - return isl::ctx(isl_union_pw_aff_list_get_ctx(ptr)); -} - -void union_pw_aff_list::dump() const { - isl_union_pw_aff_list_dump(get()); +isl::ctx union_pw_aff::ctx() const { + return isl::ctx(isl_union_pw_aff_get_ctx(ptr)); } - -isl::union_pw_aff_list union_pw_aff_list::add(isl::union_pw_aff el) const +isl::multi_union_pw_aff union_pw_aff::add(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_union_pw_aff_list_add(copy(), el.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).add(multi2); } -isl::union_pw_aff_list union_pw_aff_list::alloc(isl::ctx ctx, int n) +isl::union_pw_aff union_pw_aff::add(isl::union_pw_aff upa2) const { - auto res = isl_union_pw_aff_list_alloc(ctx.release(), n); + auto res = isl_union_pw_aff_add(copy(), upa2.release()); return manage(res); } -isl::union_pw_aff_list union_pw_aff_list::clear() const +isl::union_pw_multi_aff union_pw_aff::add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_union_pw_aff_list_clear(copy()); - return manage(res); + return isl::union_pw_multi_aff(*this).add(upma2); } -isl::union_pw_aff_list union_pw_aff_list::concat(isl::union_pw_aff_list list2) const +isl::union_pw_aff union_pw_aff::add(const isl::aff &upa2) const { - auto res = isl_union_pw_aff_list_concat(copy(), list2.release()); - return manage(res); + return 
this->add(isl::union_pw_aff(upa2)); } -isl::union_pw_aff_list union_pw_aff_list::drop(unsigned int first, unsigned int n) const +isl::union_pw_aff union_pw_aff::add(const isl::pw_aff &upa2) const { - auto res = isl_union_pw_aff_list_drop(copy(), first, n); - return manage(res); + return this->add(isl::union_pw_aff(upa2)); } -stat union_pw_aff_list::foreach(const std::function &fn) const +isl::union_pw_multi_aff union_pw_aff::add_pw_multi_aff(const isl::pw_multi_aff &pma) const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_union_pw_aff *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_union_pw_aff_list_foreach(get(), fn_lambda, &fn_data); - return manage(res); + return isl::union_pw_multi_aff(*this).add_pw_multi_aff(pma); } -isl::union_pw_aff_list union_pw_aff_list::from_union_pw_aff(isl::union_pw_aff el) +isl::union_pw_multi_aff union_pw_aff::apply(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_union_pw_aff_list_from_union_pw_aff(el.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).apply(upma2); } -isl::union_pw_aff union_pw_aff_list::get_at(int index) const +isl::multi_union_pw_aff union_pw_aff::as_multi_union_pw_aff() const { - auto res = isl_union_pw_aff_list_get_at(get(), index); - return manage(res); + return isl::union_pw_multi_aff(*this).as_multi_union_pw_aff(); } -isl::union_pw_aff union_pw_aff_list::get_union_pw_aff(int index) const +isl::pw_multi_aff union_pw_aff::as_pw_multi_aff() const { - auto res = isl_union_pw_aff_list_get_union_pw_aff(get(), index); - return manage(res); + return isl::union_pw_multi_aff(*this).as_pw_multi_aff(); } -isl::union_pw_aff_list union_pw_aff_list::insert(unsigned int pos, isl::union_pw_aff el) const +isl::union_map union_pw_aff::as_union_map() const { - auto res = isl_union_pw_aff_list_insert(copy(), pos, el.release()); - 
return manage(res); + return isl::union_pw_multi_aff(*this).as_union_map(); } -isl_size union_pw_aff_list::n_union_pw_aff() const +isl::union_pw_aff union_pw_aff::at(int pos) const { - auto res = isl_union_pw_aff_list_n_union_pw_aff(get()); - return res; + return isl::multi_union_pw_aff(*this).at(pos); } -isl::union_pw_aff_list union_pw_aff_list::reverse() const +isl::union_set union_pw_aff::bind(const isl::multi_id &tuple) const { - auto res = isl_union_pw_aff_list_reverse(copy()); - return manage(res); + return isl::multi_union_pw_aff(*this).bind(tuple); } -isl::union_pw_aff_list union_pw_aff_list::set_union_pw_aff(int index, isl::union_pw_aff el) const +isl::union_set union_pw_aff::bind(isl::id id) const { - auto res = isl_union_pw_aff_list_set_union_pw_aff(copy(), index, el.release()); + auto res = isl_union_pw_aff_bind_id(copy(), id.release()); return manage(res); } -isl_size union_pw_aff_list::size() const +isl::union_set union_pw_aff::bind(const std::string &id) const { - auto res = isl_union_pw_aff_list_size(get()); - return res; + return this->bind(isl::id(ctx(), id)); } -isl::union_pw_aff_list union_pw_aff_list::swap(unsigned int pos1, unsigned int pos2) const +isl::union_pw_aff union_pw_aff::coalesce() const { - auto res = isl_union_pw_aff_list_swap(copy(), pos1, pos2); + auto res = isl_union_pw_aff_coalesce(copy()); return manage(res); } -// implementations for isl::union_pw_multi_aff -union_pw_multi_aff manage(__isl_take isl_union_pw_multi_aff *ptr) { - return union_pw_multi_aff(ptr); -} -union_pw_multi_aff manage_copy(__isl_keep isl_union_pw_multi_aff *ptr) { - ptr = isl_union_pw_multi_aff_copy(ptr); - return union_pw_multi_aff(ptr); -} - -union_pw_multi_aff::union_pw_multi_aff() - : ptr(nullptr) {} - -union_pw_multi_aff::union_pw_multi_aff(const union_pw_multi_aff &obj) - : ptr(nullptr) +class size union_pw_aff::dim(isl::dim type) const { - ptr = obj.copy(); + return isl::multi_union_pw_aff(*this).dim(type); } - 
-union_pw_multi_aff::union_pw_multi_aff(__isl_take isl_union_pw_multi_aff *ptr) - : ptr(ptr) {} - -union_pw_multi_aff::union_pw_multi_aff(isl::aff aff) -{ - auto res = isl_union_pw_multi_aff_from_aff(aff.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::union_set uset) -{ - auto res = isl_union_pw_multi_aff_from_domain(uset.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::multi_aff ma) -{ - auto res = isl_union_pw_multi_aff_from_multi_aff(ma.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::multi_union_pw_aff mupa) -{ - auto res = isl_union_pw_multi_aff_from_multi_union_pw_aff(mupa.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::pw_multi_aff pma) -{ - auto res = isl_union_pw_multi_aff_from_pw_multi_aff(pma.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::union_map umap) -{ - auto res = isl_union_pw_multi_aff_from_union_map(umap.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::union_pw_aff upa) -{ - auto res = isl_union_pw_multi_aff_from_union_pw_aff(upa.release()); - ptr = res; -} -union_pw_multi_aff::union_pw_multi_aff(isl::ctx ctx, const std::string &str) +isl::union_set union_pw_aff::domain() const { - auto res = isl_union_pw_multi_aff_read_from_str(ctx.release(), str.c_str()); - ptr = res; + auto res = isl_union_pw_aff_domain(copy()); + return manage(res); } -union_pw_multi_aff &union_pw_multi_aff::operator=(union_pw_multi_aff obj) { - std::swap(this->ptr, obj.ptr); - return *this; +isl::union_pw_aff union_pw_aff::empty(isl::space space) +{ + auto res = isl_union_pw_aff_empty(space.release()); + return manage(res); } -union_pw_multi_aff::~union_pw_multi_aff() { - if (ptr) - isl_union_pw_multi_aff_free(ptr); +isl::pw_multi_aff union_pw_aff::extract_pw_multi_aff(const isl::space &space) const +{ + return isl::union_pw_multi_aff(*this).extract_pw_multi_aff(space); } -__isl_give isl_union_pw_multi_aff 
*union_pw_multi_aff::copy() const & { - return isl_union_pw_multi_aff_copy(ptr); +isl::multi_union_pw_aff union_pw_aff::flat_range_product(const isl::multi_union_pw_aff &multi2) const +{ + return isl::multi_union_pw_aff(*this).flat_range_product(multi2); } -__isl_keep isl_union_pw_multi_aff *union_pw_multi_aff::get() const { - return ptr; +isl::union_pw_multi_aff union_pw_aff::flat_range_product(const isl::union_pw_multi_aff &upma2) const +{ + return isl::union_pw_multi_aff(*this).flat_range_product(upma2); } -__isl_give isl_union_pw_multi_aff *union_pw_multi_aff::release() { - isl_union_pw_multi_aff *tmp = ptr; - ptr = nullptr; - return tmp; +stat union_pw_aff::foreach_pw_aff(const std::function &fn) const +{ + struct fn_data { + std::function func; + } fn_data = { fn }; + auto fn_lambda = [](isl_pw_aff *arg_0, void *arg_1) -> isl_stat { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage(arg_0)); + return ret.release(); + }; + auto res = isl_union_pw_aff_foreach_pw_aff(get(), fn_lambda, &fn_data); + return manage(res); } -bool union_pw_multi_aff::is_null() const { - return ptr == nullptr; +isl::union_pw_aff union_pw_aff::gist(isl::union_set context) const +{ + auto res = isl_union_pw_aff_gist(copy(), context.release()); + return manage(res); } - -isl::ctx union_pw_multi_aff::ctx() const { - return isl::ctx(isl_union_pw_multi_aff_get_ctx(ptr)); +boolean union_pw_aff::has_range_tuple_id() const +{ + return isl::multi_union_pw_aff(*this).has_range_tuple_id(); } -void union_pw_multi_aff::dump() const { - isl_union_pw_multi_aff_dump(get()); +isl::union_pw_aff union_pw_aff::intersect_domain(isl::space space) const +{ + auto res = isl_union_pw_aff_intersect_domain_space(copy(), space.release()); + return manage(res); } - -isl::union_pw_multi_aff union_pw_multi_aff::add(isl::union_pw_multi_aff upma2) const +isl::union_pw_aff union_pw_aff::intersect_domain(isl::union_set uset) const { - auto res = isl_union_pw_multi_aff_add(copy(), upma2.release()); + auto 
res = isl_union_pw_aff_intersect_domain_union_set(copy(), uset.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::add_pw_multi_aff(isl::pw_multi_aff pma) const +isl::union_pw_aff union_pw_aff::intersect_domain_wrapped_domain(isl::union_set uset) const { - auto res = isl_union_pw_multi_aff_add_pw_multi_aff(copy(), pma.release()); + auto res = isl_union_pw_aff_intersect_domain_wrapped_domain(copy(), uset.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::align_params(isl::space model) const +isl::union_pw_aff union_pw_aff::intersect_domain_wrapped_range(isl::union_set uset) const { - auto res = isl_union_pw_multi_aff_align_params(copy(), model.release()); + auto res = isl_union_pw_aff_intersect_domain_wrapped_range(copy(), uset.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::apply(isl::union_pw_multi_aff upma2) const +isl::union_pw_aff union_pw_aff::intersect_params(isl::set set) const { - auto res = isl_union_pw_multi_aff_apply_union_pw_multi_aff(copy(), upma2.release()); + auto res = isl_union_pw_aff_intersect_params(copy(), set.release()); return manage(res); } -isl::pw_multi_aff union_pw_multi_aff::as_pw_multi_aff() const +boolean union_pw_aff::involves_locals() const { - auto res = isl_union_pw_multi_aff_as_pw_multi_aff(copy()); - return manage(res); + return isl::union_pw_multi_aff(*this).involves_locals(); } -isl::union_pw_multi_aff union_pw_multi_aff::coalesce() const +boolean union_pw_aff::involves_nan() const { - auto res = isl_union_pw_multi_aff_coalesce(copy()); - return manage(res); + return isl::multi_union_pw_aff(*this).involves_nan(); } -isl_size union_pw_multi_aff::dim(isl::dim type) const +boolean union_pw_aff::isa_pw_multi_aff() const { - auto res = isl_union_pw_multi_aff_dim(get(), static_cast(type)); - return res; + return isl::union_pw_multi_aff(*this).isa_pw_multi_aff(); } -isl::union_set union_pw_multi_aff::domain() const +isl::union_pw_aff_list 
union_pw_aff::list() const { - auto res = isl_union_pw_multi_aff_domain(copy()); - return manage(res); + return isl::multi_union_pw_aff(*this).list(); } -isl::union_pw_multi_aff union_pw_multi_aff::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::multi_union_pw_aff union_pw_aff::neg() const { - auto res = isl_union_pw_multi_aff_drop_dims(copy(), static_cast(type), first, n); - return manage(res); + return isl::multi_union_pw_aff(*this).neg(); } -isl::union_pw_multi_aff union_pw_multi_aff::empty(isl::space space) +boolean union_pw_aff::plain_is_empty() const { - auto res = isl_union_pw_multi_aff_empty(space.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).plain_is_empty(); } -isl::union_pw_multi_aff union_pw_multi_aff::empty(isl::ctx ctx) +boolean union_pw_aff::plain_is_equal(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_union_pw_multi_aff_empty_ctx(ctx.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).plain_is_equal(multi2); } -isl::union_pw_multi_aff union_pw_multi_aff::empty_space(isl::space space) +isl::union_pw_multi_aff union_pw_aff::preimage_domain_wrapped_domain(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_union_pw_multi_aff_empty_space(space.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).preimage_domain_wrapped_domain(upma2); } -isl::pw_multi_aff union_pw_multi_aff::extract_pw_multi_aff(isl::space space) const +isl::union_pw_aff union_pw_aff::pullback(isl::union_pw_multi_aff upma) const { - auto res = isl_union_pw_multi_aff_extract_pw_multi_aff(get(), space.release()); + auto res = isl_union_pw_aff_pullback_union_pw_multi_aff(copy(), upma.release()); return manage(res); } -int union_pw_multi_aff::find_dim_by_name(isl::dim type, const std::string &name) const +isl::pw_multi_aff_list union_pw_aff::pw_multi_aff_list() const { - auto res = isl_union_pw_multi_aff_find_dim_by_name(get(), static_cast(type), name.c_str()); - return 
res; + return isl::union_pw_multi_aff(*this).pw_multi_aff_list(); } -isl::union_pw_multi_aff union_pw_multi_aff::flat_range_product(isl::union_pw_multi_aff upma2) const +isl::union_pw_multi_aff union_pw_aff::range_factor_domain() const { - auto res = isl_union_pw_multi_aff_flat_range_product(copy(), upma2.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).range_factor_domain(); } -stat union_pw_multi_aff::foreach_pw_multi_aff(const std::function &fn) const +isl::union_pw_multi_aff union_pw_aff::range_factor_range() const { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_pw_multi_aff *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_union_pw_multi_aff_foreach_pw_multi_aff(get(), fn_lambda, &fn_data); - return manage(res); + return isl::union_pw_multi_aff(*this).range_factor_range(); } -isl::union_pw_multi_aff union_pw_multi_aff::from_union_set(isl::union_set uset) +isl::multi_union_pw_aff union_pw_aff::range_product(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_union_pw_multi_aff_from_union_set(uset.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).range_product(multi2); } -isl::pw_multi_aff_list union_pw_multi_aff::get_pw_multi_aff_list() const +isl::union_pw_multi_aff union_pw_aff::range_product(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_union_pw_multi_aff_get_pw_multi_aff_list(get()); - return manage(res); + return isl::union_pw_multi_aff(*this).range_product(upma2); } -isl::space union_pw_multi_aff::get_space() const +isl::id union_pw_aff::range_tuple_id() const { - auto res = isl_union_pw_multi_aff_get_space(get()); - return manage(res); + return isl::multi_union_pw_aff(*this).range_tuple_id(); } -isl::union_pw_aff union_pw_multi_aff::get_union_pw_aff(int pos) const +isl::multi_union_pw_aff 
union_pw_aff::reset_range_tuple_id() const { - auto res = isl_union_pw_multi_aff_get_union_pw_aff(get(), pos); - return manage(res); + return isl::multi_union_pw_aff(*this).reset_range_tuple_id(); } -isl::union_pw_multi_aff union_pw_multi_aff::gist(isl::union_set context) const +isl::multi_union_pw_aff union_pw_aff::reset_tuple_id(isl::dim type) const { - auto res = isl_union_pw_multi_aff_gist(copy(), context.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).reset_tuple_id(type); } -isl::union_pw_multi_aff union_pw_multi_aff::gist_params(isl::set context) const +isl::multi_union_pw_aff union_pw_aff::scale(const isl::multi_val &mv) const { - auto res = isl_union_pw_multi_aff_gist_params(copy(), context.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).scale(mv); } -isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain(isl::space space) const +isl::multi_union_pw_aff union_pw_aff::scale(const isl::val &v) const { - auto res = isl_union_pw_multi_aff_intersect_domain_space(copy(), space.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).scale(v); } -isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain(isl::union_set uset) const +isl::multi_union_pw_aff union_pw_aff::scale(long v) const { - auto res = isl_union_pw_multi_aff_intersect_domain_union_set(copy(), uset.release()); - return manage(res); + return this->scale(isl::val(ctx(), v)); } -isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain_wrapped_domain(isl::union_set uset) const +isl::multi_union_pw_aff union_pw_aff::scale_down(const isl::multi_val &mv) const { - auto res = isl_union_pw_multi_aff_intersect_domain_wrapped_domain(copy(), uset.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).scale_down(mv); } -isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain_wrapped_range(isl::union_set uset) const +isl::multi_union_pw_aff union_pw_aff::scale_down(const isl::val &v) const { - auto res = 
isl_union_pw_multi_aff_intersect_domain_wrapped_range(copy(), uset.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).scale_down(v); } -isl::union_pw_multi_aff union_pw_multi_aff::intersect_params(isl::set set) const +isl::multi_union_pw_aff union_pw_aff::scale_down(long v) const { - auto res = isl_union_pw_multi_aff_intersect_params(copy(), set.release()); - return manage(res); + return this->scale_down(isl::val(ctx(), v)); } -boolean union_pw_multi_aff::involves_locals() const +isl::multi_union_pw_aff union_pw_aff::set_at(int pos, const isl::union_pw_aff &el) const { - auto res = isl_union_pw_multi_aff_involves_locals(get()); - return manage(res); + return isl::multi_union_pw_aff(*this).set_at(pos, el); } -boolean union_pw_multi_aff::involves_nan() const +isl::multi_union_pw_aff union_pw_aff::set_range_tuple(const isl::id &id) const { - auto res = isl_union_pw_multi_aff_involves_nan(get()); - return manage(res); + return isl::multi_union_pw_aff(*this).set_range_tuple(id); } -boolean union_pw_multi_aff::isa_pw_multi_aff() const +isl::multi_union_pw_aff union_pw_aff::set_range_tuple(const std::string &id) const { - auto res = isl_union_pw_multi_aff_isa_pw_multi_aff(get()); - return manage(res); + return this->set_range_tuple(isl::id(ctx(), id)); } -isl::union_pw_multi_aff union_pw_multi_aff::multi_val_on_domain(isl::union_set domain, isl::multi_val mv) +isl::multi_union_pw_aff union_pw_aff::set_union_pw_aff(int pos, const isl::union_pw_aff &el) const { - auto res = isl_union_pw_multi_aff_multi_val_on_domain(domain.release(), mv.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).set_union_pw_aff(pos, el); } -isl_size union_pw_multi_aff::n_pw_multi_aff() const +class size union_pw_aff::size() const { - auto res = isl_union_pw_multi_aff_n_pw_multi_aff(get()); - return res; + return isl::multi_union_pw_aff(*this).size(); } -isl::union_pw_multi_aff union_pw_multi_aff::neg() const +isl::space union_pw_aff::space() const { - 
auto res = isl_union_pw_multi_aff_neg(copy()); + auto res = isl_union_pw_aff_get_space(get()); return manage(res); } -boolean union_pw_multi_aff::plain_is_empty() const +isl::space union_pw_aff::get_space() const { - auto res = isl_union_pw_multi_aff_plain_is_empty(get()); - return manage(res); + return space(); } -boolean union_pw_multi_aff::plain_is_equal(const isl::union_pw_multi_aff &upma2) const +isl::multi_union_pw_aff union_pw_aff::sub(const isl::multi_union_pw_aff &multi2) const { - auto res = isl_union_pw_multi_aff_plain_is_equal(get(), upma2.get()); - return manage(res); + return isl::multi_union_pw_aff(*this).sub(multi2); } -isl::union_pw_multi_aff union_pw_multi_aff::preimage_domain_wrapped_domain(isl::union_pw_multi_aff upma2) const +isl::union_pw_aff union_pw_aff::sub(isl::union_pw_aff upa2) const { - auto res = isl_union_pw_multi_aff_preimage_domain_wrapped_domain_union_pw_multi_aff(copy(), upma2.release()); + auto res = isl_union_pw_aff_sub(copy(), upa2.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::pullback(isl::union_pw_multi_aff upma2) const +isl::union_pw_multi_aff union_pw_aff::sub(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_union_pw_multi_aff_pullback_union_pw_multi_aff(copy(), upma2.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).sub(upma2); } -isl::union_pw_multi_aff union_pw_multi_aff::range_factor_domain() const +isl::union_pw_aff union_pw_aff::sub(const isl::aff &upa2) const { - auto res = isl_union_pw_multi_aff_range_factor_domain(copy()); - return manage(res); + return this->sub(isl::union_pw_aff(upa2)); } -isl::union_pw_multi_aff union_pw_multi_aff::range_factor_range() const +isl::union_pw_aff union_pw_aff::sub(const isl::pw_aff &upa2) const { - auto res = isl_union_pw_multi_aff_range_factor_range(copy()); - return manage(res); + return this->sub(isl::union_pw_aff(upa2)); } -isl::union_pw_multi_aff union_pw_multi_aff::range_product(isl::union_pw_multi_aff 
upma2) const +isl::union_pw_aff union_pw_aff::subtract_domain(isl::space space) const { - auto res = isl_union_pw_multi_aff_range_product(copy(), upma2.release()); + auto res = isl_union_pw_aff_subtract_domain_space(copy(), space.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::reset_user() const +isl::union_pw_aff union_pw_aff::subtract_domain(isl::union_set uset) const { - auto res = isl_union_pw_multi_aff_reset_user(copy()); + auto res = isl_union_pw_aff_subtract_domain_union_set(copy(), uset.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::scale_down_val(isl::val val) const +isl::union_pw_aff_list union_pw_aff::to_list() const { - auto res = isl_union_pw_multi_aff_scale_down_val(copy(), val.release()); + auto res = isl_union_pw_aff_to_list(copy()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::scale_multi_val(isl::multi_val mv) const +isl::multi_union_pw_aff union_pw_aff::union_add(const isl::multi_union_pw_aff &mupa2) const { - auto res = isl_union_pw_multi_aff_scale_multi_val(copy(), mv.release()); - return manage(res); + return isl::multi_union_pw_aff(*this).union_add(mupa2); } -isl::union_pw_multi_aff union_pw_multi_aff::scale_val(isl::val val) const +isl::union_pw_aff union_pw_aff::union_add(isl::union_pw_aff upa2) const { - auto res = isl_union_pw_multi_aff_scale_val(copy(), val.release()); + auto res = isl_union_pw_aff_union_add(copy(), upa2.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff::sub(isl::union_pw_multi_aff upma2) const +isl::union_pw_multi_aff union_pw_aff::union_add(const isl::union_pw_multi_aff &upma2) const { - auto res = isl_union_pw_multi_aff_sub(copy(), upma2.release()); - return manage(res); + return isl::union_pw_multi_aff(*this).union_add(upma2); } -isl::union_pw_multi_aff union_pw_multi_aff::subtract_domain(isl::space space) const +isl::union_pw_aff union_pw_aff::union_add(const isl::aff &upa2) const { - auto res = 
isl_union_pw_multi_aff_subtract_domain_space(copy(), space.release()); - return manage(res); + return this->union_add(isl::union_pw_aff(upa2)); } -isl::union_pw_multi_aff union_pw_multi_aff::subtract_domain(isl::union_set uset) const +isl::union_pw_aff union_pw_aff::union_add(const isl::pw_aff &upa2) const { - auto res = isl_union_pw_multi_aff_subtract_domain_union_set(copy(), uset.release()); - return manage(res); + return this->union_add(isl::union_pw_aff(upa2)); } -isl::union_pw_multi_aff union_pw_multi_aff::union_add(isl::union_pw_multi_aff upma2) const +inline std::ostream &operator<<(std::ostream &os, const union_pw_aff &obj) { - auto res = isl_union_pw_multi_aff_union_add(copy(), upma2.release()); - return manage(res); + char *str = isl_union_pw_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -// implementations for isl::union_pw_multi_aff_list -union_pw_multi_aff_list manage(__isl_take isl_union_pw_multi_aff_list *ptr) { - return union_pw_multi_aff_list(ptr); +// implementations for isl::union_pw_aff_list +union_pw_aff_list manage(__isl_take isl_union_pw_aff_list *ptr) { + return union_pw_aff_list(ptr); } -union_pw_multi_aff_list manage_copy(__isl_keep isl_union_pw_multi_aff_list *ptr) { - ptr = isl_union_pw_multi_aff_list_copy(ptr); - return union_pw_multi_aff_list(ptr); +union_pw_aff_list manage_copy(__isl_keep isl_union_pw_aff_list *ptr) { + ptr = isl_union_pw_aff_list_copy(ptr); + return union_pw_aff_list(ptr); } -union_pw_multi_aff_list::union_pw_multi_aff_list() +union_pw_aff_list::union_pw_aff_list() : ptr(nullptr) {} -union_pw_multi_aff_list::union_pw_multi_aff_list(const union_pw_multi_aff_list &obj) +union_pw_aff_list::union_pw_aff_list(const union_pw_aff_list &obj) : ptr(nullptr) { ptr = obj.copy(); } - -union_pw_multi_aff_list::union_pw_multi_aff_list(__isl_take isl_union_pw_multi_aff_list *ptr) +union_pw_aff_list::union_pw_aff_list(__isl_take 
isl_union_pw_aff_list *ptr) : ptr(ptr) {} +union_pw_aff_list::union_pw_aff_list(isl::ctx ctx, int n) +{ + auto res = isl_union_pw_aff_list_alloc(ctx.release(), n); + ptr = res; +} + +union_pw_aff_list::union_pw_aff_list(isl::union_pw_aff el) +{ + auto res = isl_union_pw_aff_list_from_union_pw_aff(el.release()); + ptr = res; +} + +union_pw_aff_list::union_pw_aff_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_union_pw_aff_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} -union_pw_multi_aff_list &union_pw_multi_aff_list::operator=(union_pw_multi_aff_list obj) { +union_pw_aff_list &union_pw_aff_list::operator=(union_pw_aff_list obj) { std::swap(this->ptr, obj.ptr); return *this; } -union_pw_multi_aff_list::~union_pw_multi_aff_list() { +union_pw_aff_list::~union_pw_aff_list() { if (ptr) - isl_union_pw_multi_aff_list_free(ptr); + isl_union_pw_aff_list_free(ptr); } -__isl_give isl_union_pw_multi_aff_list *union_pw_multi_aff_list::copy() const & { - return isl_union_pw_multi_aff_list_copy(ptr); +__isl_give isl_union_pw_aff_list *union_pw_aff_list::copy() const & { + return isl_union_pw_aff_list_copy(ptr); } -__isl_keep isl_union_pw_multi_aff_list *union_pw_multi_aff_list::get() const { +__isl_keep isl_union_pw_aff_list *union_pw_aff_list::get() const { return ptr; } -__isl_give isl_union_pw_multi_aff_list *union_pw_multi_aff_list::release() { - isl_union_pw_multi_aff_list *tmp = ptr; +__isl_give isl_union_pw_aff_list *union_pw_aff_list::release() { + isl_union_pw_aff_list *tmp = ptr; ptr = nullptr; return tmp; } -bool union_pw_multi_aff_list::is_null() const { +bool union_pw_aff_list::is_null() const { return ptr == nullptr; } - -isl::ctx union_pw_multi_aff_list::ctx() const { - return isl::ctx(isl_union_pw_multi_aff_list_get_ctx(ptr)); +isl::ctx union_pw_aff_list::ctx() const { + return isl::ctx(isl_union_pw_aff_list_get_ctx(ptr)); } -void union_pw_multi_aff_list::dump() const { - isl_union_pw_multi_aff_list_dump(get()); 
+isl::union_pw_aff_list union_pw_aff_list::add(isl::union_pw_aff el) const +{ + auto res = isl_union_pw_aff_list_add(copy(), el.release()); + return manage(res); } - -isl::union_pw_multi_aff_list union_pw_multi_aff_list::add(isl::union_pw_multi_aff el) const +isl::union_pw_aff union_pw_aff_list::at(int index) const { - auto res = isl_union_pw_multi_aff_list_add(copy(), el.release()); + auto res = isl_union_pw_aff_list_get_at(get(), index); return manage(res); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::alloc(isl::ctx ctx, int n) +isl::union_pw_aff union_pw_aff_list::get_at(int index) const { - auto res = isl_union_pw_multi_aff_list_alloc(ctx.release(), n); - return manage(res); + return at(index); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::clear() const +isl::union_pw_aff_list union_pw_aff_list::clear() const { - auto res = isl_union_pw_multi_aff_list_clear(copy()); + auto res = isl_union_pw_aff_list_clear(copy()); return manage(res); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::concat(isl::union_pw_multi_aff_list list2) const +isl::union_pw_aff_list union_pw_aff_list::concat(isl::union_pw_aff_list list2) const { - auto res = isl_union_pw_multi_aff_list_concat(copy(), list2.release()); + auto res = isl_union_pw_aff_list_concat(copy(), list2.release()); return manage(res); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::drop(unsigned int first, unsigned int n) const +isl::union_pw_aff_list union_pw_aff_list::drop(unsigned int first, unsigned int n) const { - auto res = isl_union_pw_multi_aff_list_drop(copy(), first, n); + auto res = isl_union_pw_aff_list_drop(copy(), first, n); return manage(res); } -stat union_pw_multi_aff_list::foreach(const std::function &fn) const +stat union_pw_aff_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_union_pw_multi_aff *arg_0, void *arg_1) -> isl_stat { + std::function func; + } fn_data = { fn 
}; + auto fn_lambda = [](isl_union_pw_aff *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; - auto res = isl_union_pw_multi_aff_list_foreach(get(), fn_lambda, &fn_data); + auto res = isl_union_pw_aff_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::from_union_pw_multi_aff(isl::union_pw_multi_aff el) +isl::union_pw_aff_list union_pw_aff_list::insert(unsigned int pos, isl::union_pw_aff el) const { - auto res = isl_union_pw_multi_aff_list_from_union_pw_multi_aff(el.release()); + auto res = isl_union_pw_aff_list_insert(copy(), pos, el.release()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff_list::get_at(int index) const +class size union_pw_aff_list::size() const { - auto res = isl_union_pw_multi_aff_list_get_at(get(), index); + auto res = isl_union_pw_aff_list_size(get()); return manage(res); } -isl::union_pw_multi_aff union_pw_multi_aff_list::get_union_pw_multi_aff(int index) const +inline std::ostream &operator<<(std::ostream &os, const union_pw_aff_list &obj) { - auto res = isl_union_pw_multi_aff_list_get_union_pw_multi_aff(get(), index); - return manage(res); + char *str = isl_union_pw_aff_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::insert(unsigned int pos, isl::union_pw_multi_aff el) const -{ - auto res = isl_union_pw_multi_aff_list_insert(copy(), pos, el.release()); - return manage(res); +// implementations for isl::union_pw_multi_aff +union_pw_multi_aff manage(__isl_take isl_union_pw_multi_aff *ptr) { + return union_pw_multi_aff(ptr); } - -isl_size union_pw_multi_aff_list::n_union_pw_multi_aff() const -{ - auto res = isl_union_pw_multi_aff_list_n_union_pw_multi_aff(get()); - return res; 
+union_pw_multi_aff manage_copy(__isl_keep isl_union_pw_multi_aff *ptr) { + ptr = isl_union_pw_multi_aff_copy(ptr); + return union_pw_multi_aff(ptr); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::reverse() const +union_pw_multi_aff::union_pw_multi_aff() + : ptr(nullptr) {} + +union_pw_multi_aff::union_pw_multi_aff(const union_pw_multi_aff &obj) + : ptr(nullptr) { - auto res = isl_union_pw_multi_aff_list_reverse(copy()); - return manage(res); + ptr = obj.copy(); } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::set_union_pw_multi_aff(int index, isl::union_pw_multi_aff el) const +union_pw_multi_aff::union_pw_multi_aff(__isl_take isl_union_pw_multi_aff *ptr) + : ptr(ptr) {} + +union_pw_multi_aff::union_pw_multi_aff(isl::union_set uset) { - auto res = isl_union_pw_multi_aff_list_set_union_pw_multi_aff(copy(), index, el.release()); - return manage(res); + auto res = isl_union_pw_multi_aff_from_domain(uset.release()); + ptr = res; } -isl_size union_pw_multi_aff_list::size() const +union_pw_multi_aff::union_pw_multi_aff(isl::multi_aff ma) { - auto res = isl_union_pw_multi_aff_list_size(get()); - return res; + auto res = isl_union_pw_multi_aff_from_multi_aff(ma.release()); + ptr = res; } -isl::union_pw_multi_aff_list union_pw_multi_aff_list::swap(unsigned int pos1, unsigned int pos2) const +union_pw_multi_aff::union_pw_multi_aff(isl::pw_multi_aff pma) { - auto res = isl_union_pw_multi_aff_list_swap(copy(), pos1, pos2); - return manage(res); + auto res = isl_union_pw_multi_aff_from_pw_multi_aff(pma.release()); + ptr = res; } -// implementations for isl::union_pw_qpolynomial -union_pw_qpolynomial manage(__isl_take isl_union_pw_qpolynomial *ptr) { - return union_pw_qpolynomial(ptr); -} -union_pw_qpolynomial manage_copy(__isl_keep isl_union_pw_qpolynomial *ptr) { - ptr = isl_union_pw_qpolynomial_copy(ptr); - return union_pw_qpolynomial(ptr); +union_pw_multi_aff::union_pw_multi_aff(isl::union_map umap) +{ + auto res = 
isl_union_pw_multi_aff_from_union_map(umap.release()); + ptr = res; } -union_pw_qpolynomial::union_pw_qpolynomial() - : ptr(nullptr) {} - -union_pw_qpolynomial::union_pw_qpolynomial(const union_pw_qpolynomial &obj) - : ptr(nullptr) +union_pw_multi_aff::union_pw_multi_aff(isl::union_pw_aff upa) { - ptr = obj.copy(); + auto res = isl_union_pw_multi_aff_from_union_pw_aff(upa.release()); + ptr = res; } - -union_pw_qpolynomial::union_pw_qpolynomial(__isl_take isl_union_pw_qpolynomial *ptr) - : ptr(ptr) {} - -union_pw_qpolynomial::union_pw_qpolynomial(isl::ctx ctx, const std::string &str) +union_pw_multi_aff::union_pw_multi_aff(isl::ctx ctx, const std::string &str) { - auto res = isl_union_pw_qpolynomial_read_from_str(ctx.release(), str.c_str()); + auto res = isl_union_pw_multi_aff_read_from_str(ctx.release(), str.c_str()); ptr = res; } -union_pw_qpolynomial &union_pw_qpolynomial::operator=(union_pw_qpolynomial obj) { +union_pw_multi_aff &union_pw_multi_aff::operator=(union_pw_multi_aff obj) { std::swap(this->ptr, obj.ptr); return *this; } -union_pw_qpolynomial::~union_pw_qpolynomial() { +union_pw_multi_aff::~union_pw_multi_aff() { if (ptr) - isl_union_pw_qpolynomial_free(ptr); + isl_union_pw_multi_aff_free(ptr); } -__isl_give isl_union_pw_qpolynomial *union_pw_qpolynomial::copy() const & { - return isl_union_pw_qpolynomial_copy(ptr); +__isl_give isl_union_pw_multi_aff *union_pw_multi_aff::copy() const & { + return isl_union_pw_multi_aff_copy(ptr); } -__isl_keep isl_union_pw_qpolynomial *union_pw_qpolynomial::get() const { +__isl_keep isl_union_pw_multi_aff *union_pw_multi_aff::get() const { return ptr; } -__isl_give isl_union_pw_qpolynomial *union_pw_qpolynomial::release() { - isl_union_pw_qpolynomial *tmp = ptr; +__isl_give isl_union_pw_multi_aff *union_pw_multi_aff::release() { + isl_union_pw_multi_aff *tmp = ptr; ptr = nullptr; return tmp; } -bool union_pw_qpolynomial::is_null() const { +bool union_pw_multi_aff::is_null() const { return ptr == nullptr; } - -isl::ctx 
union_pw_qpolynomial::ctx() const { - return isl::ctx(isl_union_pw_qpolynomial_get_ctx(ptr)); +isl::ctx union_pw_multi_aff::ctx() const { + return isl::ctx(isl_union_pw_multi_aff_get_ctx(ptr)); } - -isl::union_pw_qpolynomial union_pw_qpolynomial::add(isl::union_pw_qpolynomial upwqp2) const +isl::union_pw_multi_aff union_pw_multi_aff::add(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_add(copy(), upwqp2.release()); + auto res = isl_union_pw_multi_aff_add(copy(), upma2.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::add_pw_qpolynomial(isl::pw_qpolynomial pwqp) const +isl::union_pw_multi_aff union_pw_multi_aff::add_pw_multi_aff(isl::pw_multi_aff pma) const { - auto res = isl_union_pw_qpolynomial_add_pw_qpolynomial(copy(), pwqp.release()); + auto res = isl_union_pw_multi_aff_add_pw_multi_aff(copy(), pma.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::align_params(isl::space model) const +isl::union_pw_multi_aff union_pw_multi_aff::apply(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_align_params(copy(), model.release()); + auto res = isl_union_pw_multi_aff_apply_union_pw_multi_aff(copy(), upma2.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::coalesce() const +isl::multi_union_pw_aff union_pw_multi_aff::as_multi_union_pw_aff() const { - auto res = isl_union_pw_qpolynomial_coalesce(copy()); + auto res = isl_union_pw_multi_aff_as_multi_union_pw_aff(copy()); return manage(res); } -isl_size union_pw_qpolynomial::dim(isl::dim type) const -{ - auto res = isl_union_pw_qpolynomial_dim(get(), static_cast(type)); - return res; -} - -isl::union_set union_pw_qpolynomial::domain() const +isl::pw_multi_aff union_pw_multi_aff::as_pw_multi_aff() const { - auto res = isl_union_pw_qpolynomial_domain(copy()); + auto res = isl_union_pw_multi_aff_as_pw_multi_aff(copy()); return manage(res); } -isl::union_pw_qpolynomial 
union_pw_qpolynomial::drop_dims(isl::dim type, unsigned int first, unsigned int n) const +isl::union_map union_pw_multi_aff::as_union_map() const { - auto res = isl_union_pw_qpolynomial_drop_dims(copy(), static_cast(type), first, n); + auto res = isl_union_pw_multi_aff_as_union_map(copy()); return manage(res); } -isl::val union_pw_qpolynomial::eval(isl::point pnt) const +isl::union_pw_multi_aff union_pw_multi_aff::coalesce() const { - auto res = isl_union_pw_qpolynomial_eval(copy(), pnt.release()); + auto res = isl_union_pw_multi_aff_coalesce(copy()); return manage(res); } -isl::pw_qpolynomial union_pw_qpolynomial::extract_pw_qpolynomial(isl::space space) const +isl::union_set union_pw_multi_aff::domain() const { - auto res = isl_union_pw_qpolynomial_extract_pw_qpolynomial(get(), space.release()); + auto res = isl_union_pw_multi_aff_domain(copy()); return manage(res); } -int union_pw_qpolynomial::find_dim_by_name(isl::dim type, const std::string &name) const -{ - auto res = isl_union_pw_qpolynomial_find_dim_by_name(get(), static_cast(type), name.c_str()); - return res; -} - -stat union_pw_qpolynomial::foreach_pw_qpolynomial(const std::function &fn) const +isl::union_pw_multi_aff union_pw_multi_aff::empty(isl::space space) { - struct fn_data { - const std::function *func; - } fn_data = { &fn }; - auto fn_lambda = [](isl_pw_qpolynomial *arg_0, void *arg_1) -> isl_stat { - auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); - return ret.release(); - }; - auto res = isl_union_pw_qpolynomial_foreach_pw_qpolynomial(get(), fn_lambda, &fn_data); + auto res = isl_union_pw_multi_aff_empty(space.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::from_pw_qpolynomial(isl::pw_qpolynomial pwqp) +isl::union_pw_multi_aff union_pw_multi_aff::empty(isl::ctx ctx) { - auto res = isl_union_pw_qpolynomial_from_pw_qpolynomial(pwqp.release()); + auto res = isl_union_pw_multi_aff_empty_ctx(ctx.release()); return manage(res); } 
-isl::pw_qpolynomial_list union_pw_qpolynomial::get_pw_qpolynomial_list() const +isl::pw_multi_aff union_pw_multi_aff::extract_pw_multi_aff(isl::space space) const { - auto res = isl_union_pw_qpolynomial_get_pw_qpolynomial_list(get()); + auto res = isl_union_pw_multi_aff_extract_pw_multi_aff(get(), space.release()); return manage(res); } -isl::space union_pw_qpolynomial::get_space() const +isl::union_pw_multi_aff union_pw_multi_aff::flat_range_product(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_get_space(get()); + auto res = isl_union_pw_multi_aff_flat_range_product(copy(), upma2.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::gist(isl::union_set context) const +isl::union_pw_multi_aff union_pw_multi_aff::gist(isl::union_set context) const { - auto res = isl_union_pw_qpolynomial_gist(copy(), context.release()); + auto res = isl_union_pw_multi_aff_gist(copy(), context.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::gist_params(isl::set context) const +isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain(isl::space space) const { - auto res = isl_union_pw_qpolynomial_gist_params(copy(), context.release()); + auto res = isl_union_pw_multi_aff_intersect_domain_space(copy(), space.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::intersect_domain(isl::union_set uset) const +isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain(isl::union_set uset) const { - auto res = isl_union_pw_qpolynomial_intersect_domain(copy(), uset.release()); + auto res = isl_union_pw_multi_aff_intersect_domain_union_set(copy(), uset.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::intersect_domain_space(isl::space space) const +isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain_wrapped_domain(isl::union_set uset) const { - auto res = isl_union_pw_qpolynomial_intersect_domain_space(copy(), 
space.release()); + auto res = isl_union_pw_multi_aff_intersect_domain_wrapped_domain(copy(), uset.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::intersect_domain_union_set(isl::union_set uset) const +isl::union_pw_multi_aff union_pw_multi_aff::intersect_domain_wrapped_range(isl::union_set uset) const { - auto res = isl_union_pw_qpolynomial_intersect_domain_union_set(copy(), uset.release()); + auto res = isl_union_pw_multi_aff_intersect_domain_wrapped_range(copy(), uset.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::intersect_domain_wrapped_domain(isl::union_set uset) const +isl::union_pw_multi_aff union_pw_multi_aff::intersect_params(isl::set set) const { - auto res = isl_union_pw_qpolynomial_intersect_domain_wrapped_domain(copy(), uset.release()); + auto res = isl_union_pw_multi_aff_intersect_params(copy(), set.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::intersect_domain_wrapped_range(isl::union_set uset) const +boolean union_pw_multi_aff::involves_locals() const { - auto res = isl_union_pw_qpolynomial_intersect_domain_wrapped_range(copy(), uset.release()); + auto res = isl_union_pw_multi_aff_involves_locals(get()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::intersect_params(isl::set set) const +boolean union_pw_multi_aff::isa_pw_multi_aff() const { - auto res = isl_union_pw_qpolynomial_intersect_params(copy(), set.release()); + auto res = isl_union_pw_multi_aff_isa_pw_multi_aff(get()); return manage(res); } -boolean union_pw_qpolynomial::involves_nan() const +boolean union_pw_multi_aff::plain_is_empty() const { - auto res = isl_union_pw_qpolynomial_involves_nan(get()); + auto res = isl_union_pw_multi_aff_plain_is_empty(get()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::mul(isl::union_pw_qpolynomial upwqp2) const +isl::union_pw_multi_aff 
union_pw_multi_aff::preimage_domain_wrapped_domain(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_mul(copy(), upwqp2.release()); + auto res = isl_union_pw_multi_aff_preimage_domain_wrapped_domain_union_pw_multi_aff(copy(), upma2.release()); return manage(res); } -isl_size union_pw_qpolynomial::n_pw_qpolynomial() const -{ - auto res = isl_union_pw_qpolynomial_n_pw_qpolynomial(get()); - return res; -} - -isl::union_pw_qpolynomial union_pw_qpolynomial::neg() const +isl::union_pw_multi_aff union_pw_multi_aff::pullback(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_neg(copy()); + auto res = isl_union_pw_multi_aff_pullback_union_pw_multi_aff(copy(), upma2.release()); return manage(res); } -boolean union_pw_qpolynomial::plain_is_equal(const isl::union_pw_qpolynomial &upwqp2) const +isl::pw_multi_aff_list union_pw_multi_aff::pw_multi_aff_list() const { - auto res = isl_union_pw_qpolynomial_plain_is_equal(get(), upwqp2.get()); + auto res = isl_union_pw_multi_aff_get_pw_multi_aff_list(get()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::reset_user() const +isl::pw_multi_aff_list union_pw_multi_aff::get_pw_multi_aff_list() const { - auto res = isl_union_pw_qpolynomial_reset_user(copy()); - return manage(res); + return pw_multi_aff_list(); } -isl::union_pw_qpolynomial union_pw_qpolynomial::scale_down_val(isl::val v) const +isl::union_pw_multi_aff union_pw_multi_aff::range_factor_domain() const { - auto res = isl_union_pw_qpolynomial_scale_down_val(copy(), v.release()); + auto res = isl_union_pw_multi_aff_range_factor_domain(copy()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::scale_val(isl::val v) const +isl::union_pw_multi_aff union_pw_multi_aff::range_factor_range() const { - auto res = isl_union_pw_qpolynomial_scale_val(copy(), v.release()); + auto res = isl_union_pw_multi_aff_range_factor_range(copy()); return manage(res); } -isl::union_pw_qpolynomial 
union_pw_qpolynomial::sub(isl::union_pw_qpolynomial upwqp2) const +isl::union_pw_multi_aff union_pw_multi_aff::range_product(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_sub(copy(), upwqp2.release()); + auto res = isl_union_pw_multi_aff_range_product(copy(), upma2.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::subtract_domain(isl::union_set uset) const +isl::space union_pw_multi_aff::space() const { - auto res = isl_union_pw_qpolynomial_subtract_domain(copy(), uset.release()); + auto res = isl_union_pw_multi_aff_get_space(get()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::subtract_domain_space(isl::space space) const +isl::space union_pw_multi_aff::get_space() const { - auto res = isl_union_pw_qpolynomial_subtract_domain_space(copy(), space.release()); - return manage(res); + return space(); } -isl::union_pw_qpolynomial union_pw_qpolynomial::subtract_domain_union_set(isl::union_set uset) const +isl::union_pw_multi_aff union_pw_multi_aff::sub(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_subtract_domain_union_set(copy(), uset.release()); + auto res = isl_union_pw_multi_aff_sub(copy(), upma2.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::to_polynomial(int sign) const +isl::union_pw_multi_aff union_pw_multi_aff::subtract_domain(isl::space space) const { - auto res = isl_union_pw_qpolynomial_to_polynomial(copy(), sign); + auto res = isl_union_pw_multi_aff_subtract_domain_space(copy(), space.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::zero(isl::space space) +isl::union_pw_multi_aff union_pw_multi_aff::subtract_domain(isl::union_set uset) const { - auto res = isl_union_pw_qpolynomial_zero(space.release()); + auto res = isl_union_pw_multi_aff_subtract_domain_union_set(copy(), uset.release()); return manage(res); } -isl::union_pw_qpolynomial 
union_pw_qpolynomial::zero_ctx(isl::ctx ctx) +isl::union_pw_multi_aff union_pw_multi_aff::union_add(isl::union_pw_multi_aff upma2) const { - auto res = isl_union_pw_qpolynomial_zero_ctx(ctx.release()); + auto res = isl_union_pw_multi_aff_union_add(copy(), upma2.release()); return manage(res); } -isl::union_pw_qpolynomial union_pw_qpolynomial::zero_space(isl::space space) +inline std::ostream &operator<<(std::ostream &os, const union_pw_multi_aff &obj) { - auto res = isl_union_pw_qpolynomial_zero_space(space.release()); - return manage(res); + char *str = isl_union_pw_multi_aff_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::union_set @@ -19648,7 +21885,6 @@ union_set::union_set(const union_set &obj) ptr = obj.copy(); } - union_set::union_set(__isl_take isl_union_set *ptr) : ptr(ptr) {} @@ -19657,16 +21893,19 @@ union_set::union_set(isl::basic_set bset) auto res = isl_union_set_from_basic_set(bset.release()); ptr = res; } + union_set::union_set(isl::point pnt) { auto res = isl_union_set_from_point(pnt.release()); ptr = res; } + union_set::union_set(isl::set set) { auto res = isl_union_set_from_set(set.release()); ptr = res; } + union_set::union_set(isl::ctx ctx, const std::string &str) { auto res = isl_union_set_read_from_str(ctx.release(), str.c_str()); @@ -19701,49 +21940,37 @@ bool union_set::is_null() const { return ptr == nullptr; } - isl::ctx union_set::ctx() const { return isl::ctx(isl_union_set_get_ctx(ptr)); } -void union_set::dump() const { - isl_union_set_dump(get()); -} - - isl::union_set union_set::affine_hull() const { auto res = isl_union_set_affine_hull(copy()); return manage(res); } -isl::union_set union_set::align_params(isl::space model) const -{ - auto res = isl_union_set_align_params(copy(), model.release()); - return manage(res); -} - isl::union_set union_set::apply(isl::union_map umap) const { auto res = isl_union_set_apply(copy(), 
umap.release()); return manage(res); } -isl::union_set union_set::coalesce() const +isl::set union_set::as_set() const { - auto res = isl_union_set_coalesce(copy()); + auto res = isl_union_set_as_set(copy()); return manage(res); } -isl::union_set union_set::coefficients() const +isl::union_set union_set::coalesce() const { - auto res = isl_union_set_coefficients(copy()); + auto res = isl_union_set_coalesce(copy()); return manage(res); } -isl::schedule union_set::compute_schedule(isl::union_map validity, isl::union_map proximity) const +isl::union_set union_set::compute_divs() const { - auto res = isl_union_set_compute_schedule(copy(), validity.release(), proximity.release()); + auto res = isl_union_set_compute_divs(copy()); return manage(res); } @@ -19759,15 +21986,23 @@ isl::union_set union_set::detect_equalities() const return manage(res); } -isl_size union_set::dim(isl::dim type) const +isl::union_set union_set::empty(isl::ctx ctx) { - auto res = isl_union_set_dim(get(), static_cast(type)); - return res; + auto res = isl_union_set_empty_ctx(ctx.release()); + return manage(res); } -isl::union_set union_set::empty(isl::ctx ctx) +boolean union_set::every_set(const std::function &test) const { - auto res = isl_union_set_empty_ctx(ctx.release()); + struct test_data { + std::function func; + } test_data = { test }; + auto test_lambda = [](isl_set *arg_0, void *arg_1) -> isl_bool { + auto *data = static_cast(arg_1); + auto ret = (data->func)(manage_copy(arg_0)); + return ret.release(); + }; + auto res = isl_union_set_every_set(get(), test_lambda, &test_data); return manage(res); } @@ -19777,58 +22012,34 @@ isl::set union_set::extract_set(isl::space space) const return manage(res); } -stat union_set::foreach_point(const std::function &fn) const +stat union_set::foreach_point(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_point *arg_0, void *arg_1) 
-> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_union_set_foreach_point(get(), fn_lambda, &fn_data); return manage(res); } -stat union_set::foreach_set(const std::function &fn) const +stat union_set::foreach_set(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_set *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_union_set_foreach_set(get(), fn_lambda, &fn_data); return manage(res); } -isl::basic_set_list union_set::get_basic_set_list() const -{ - auto res = isl_union_set_get_basic_set_list(get()); - return manage(res); -} - -uint32_t union_set::get_hash() const -{ - auto res = isl_union_set_get_hash(get()); - return res; -} - -isl::set_list union_set::get_set_list() const -{ - auto res = isl_union_set_get_set_list(get()); - return manage(res); -} - -isl::space union_set::get_space() const -{ - auto res = isl_union_set_get_space(get()); - return manage(res); -} - isl::union_set union_set::gist(isl::union_set context) const { auto res = isl_union_set_gist(copy(), context.release()); @@ -19883,12 +22094,6 @@ boolean union_set::is_equal(const isl::union_set &uset2) const return manage(res); } -boolean union_set::is_params() const -{ - auto res = isl_union_set_is_params(get()); - return manage(res); -} - boolean union_set::is_strict_subset(const isl::union_set &uset2) const { auto res = isl_union_set_is_strict_subset(get(), uset2.get()); @@ -19907,30 +22112,6 @@ boolean union_set::isa_set() const return manage(res); } -isl::union_map union_set::lex_ge_union_set(isl::union_set uset2) const -{ - auto res = isl_union_set_lex_ge_union_set(copy(), uset2.release()); - return manage(res); -} - 
-isl::union_map union_set::lex_gt_union_set(isl::union_set uset2) const -{ - auto res = isl_union_set_lex_gt_union_set(copy(), uset2.release()); - return manage(res); -} - -isl::union_map union_set::lex_le_union_set(isl::union_set uset2) const -{ - auto res = isl_union_set_lex_le_union_set(copy(), uset2.release()); - return manage(res); -} - -isl::union_map union_set::lex_lt_union_set(isl::union_set uset2) const -{ - auto res = isl_union_set_lex_lt_union_set(copy(), uset2.release()); - return manage(res); -} - isl::union_set union_set::lexmax() const { auto res = isl_union_set_lexmax(copy()); @@ -19943,18 +22124,6 @@ isl::union_set union_set::lexmin() const return manage(res); } -isl::multi_val union_set::min_multi_union_pw_aff(const isl::multi_union_pw_aff &obj) const -{ - auto res = isl_union_set_min_multi_union_pw_aff(get(), obj.get()); - return manage(res); -} - -isl_size union_set::n_set() const -{ - auto res = isl_union_set_n_set(get()); - return res; -} - isl::set union_set::params() const { auto res = isl_union_set_params(copy()); @@ -19985,69 +22154,43 @@ isl::union_set union_set::preimage(isl::union_pw_multi_aff upma) const return manage(res); } -isl::union_set union_set::product(isl::union_set uset2) const -{ - auto res = isl_union_set_product(copy(), uset2.release()); - return manage(res); -} - -isl::union_set union_set::project_out(isl::dim type, unsigned int first, unsigned int n) const -{ - auto res = isl_union_set_project_out(copy(), static_cast(type), first, n); - return manage(res); -} - -isl::union_set union_set::project_out_all_params() const -{ - auto res = isl_union_set_project_out_all_params(copy()); - return manage(res); -} - -isl::union_set union_set::remove_divs() const -{ - auto res = isl_union_set_remove_divs(copy()); - return manage(res); -} - -isl::union_set union_set::remove_redundancies() const +isl::point union_set::sample_point() const { - auto res = isl_union_set_remove_redundancies(copy()); + auto res = 
isl_union_set_sample_point(copy()); return manage(res); } -isl::union_set union_set::reset_user() const +isl::set_list union_set::set_list() const { - auto res = isl_union_set_reset_user(copy()); + auto res = isl_union_set_get_set_list(get()); return manage(res); } -isl::basic_set union_set::sample() const +isl::set_list union_set::get_set_list() const { - auto res = isl_union_set_sample(copy()); - return manage(res); + return set_list(); } -isl::point union_set::sample_point() const +isl::space union_set::space() const { - auto res = isl_union_set_sample_point(copy()); + auto res = isl_union_set_get_space(get()); return manage(res); } -isl::union_set union_set::simple_hull() const +isl::space union_set::get_space() const { - auto res = isl_union_set_simple_hull(copy()); - return manage(res); + return space(); } -isl::union_set union_set::solutions() const +isl::union_set union_set::subtract(isl::union_set uset2) const { - auto res = isl_union_set_solutions(copy()); + auto res = isl_union_set_subtract(copy(), uset2.release()); return manage(res); } -isl::union_set union_set::subtract(isl::union_set uset2) const +isl::union_set_list union_set::to_list() const { - auto res = isl_union_set_subtract(copy(), uset2.release()); + auto res = isl_union_set_to_list(copy()); return manage(res); } @@ -20069,10 +22212,16 @@ isl::union_map union_set::unwrap() const return manage(res); } -isl::union_map union_set::wrapped_domain_map() const +inline std::ostream &operator<<(std::ostream &os, const union_set &obj) { - auto res = isl_union_set_wrapped_domain_map(copy()); - return manage(res); + char *str = isl_union_set_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::union_set_list @@ -20093,10 +22242,26 @@ union_set_list::union_set_list(const union_set_list &obj) ptr = obj.copy(); } - union_set_list::union_set_list(__isl_take isl_union_set_list *ptr) : ptr(ptr) {} 
+union_set_list::union_set_list(isl::ctx ctx, int n) +{ + auto res = isl_union_set_list_alloc(ctx.release(), n); + ptr = res; +} + +union_set_list::union_set_list(isl::union_set el) +{ + auto res = isl_union_set_list_from_union_set(el.release()); + ptr = res; +} + +union_set_list::union_set_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_union_set_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} union_set_list &union_set_list::operator=(union_set_list obj) { std::swap(this->ptr, obj.ptr); @@ -20126,28 +22291,27 @@ bool union_set_list::is_null() const { return ptr == nullptr; } - isl::ctx union_set_list::ctx() const { return isl::ctx(isl_union_set_list_get_ctx(ptr)); } -void union_set_list::dump() const { - isl_union_set_list_dump(get()); -} - - isl::union_set_list union_set_list::add(isl::union_set el) const { auto res = isl_union_set_list_add(copy(), el.release()); return manage(res); } -isl::union_set_list union_set_list::alloc(isl::ctx ctx, int n) +isl::union_set union_set_list::at(int index) const { - auto res = isl_union_set_list_alloc(ctx.release(), n); + auto res = isl_union_set_list_get_at(get(), index); return manage(res); } +isl::union_set union_set_list::get_at(int index) const +{ + return at(index); +} + isl::union_set_list union_set_list::clear() const { auto res = isl_union_set_list_clear(copy()); @@ -20166,78 +22330,42 @@ isl::union_set_list union_set_list::drop(unsigned int first, unsigned int n) con return manage(res); } -stat union_set_list::foreach(const std::function &fn) const +stat union_set_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_union_set *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_union_set_list_foreach(get(), fn_lambda, 
&fn_data); return manage(res); } -isl::union_set_list union_set_list::from_union_set(isl::union_set el) -{ - auto res = isl_union_set_list_from_union_set(el.release()); - return manage(res); -} - -isl::union_set union_set_list::get_at(int index) const -{ - auto res = isl_union_set_list_get_at(get(), index); - return manage(res); -} - -isl::union_set union_set_list::get_union_set(int index) const -{ - auto res = isl_union_set_list_get_union_set(get(), index); - return manage(res); -} - isl::union_set_list union_set_list::insert(unsigned int pos, isl::union_set el) const { auto res = isl_union_set_list_insert(copy(), pos, el.release()); return manage(res); } -isl_size union_set_list::n_union_set() const -{ - auto res = isl_union_set_list_n_union_set(get()); - return res; -} - -isl::union_set_list union_set_list::reverse() const -{ - auto res = isl_union_set_list_reverse(copy()); - return manage(res); -} - -isl::union_set_list union_set_list::set_union_set(int index, isl::union_set el) const -{ - auto res = isl_union_set_list_set_union_set(copy(), index, el.release()); - return manage(res); -} - -isl_size union_set_list::size() const +class size union_set_list::size() const { auto res = isl_union_set_list_size(get()); - return res; -} - -isl::union_set_list union_set_list::swap(unsigned int pos1, unsigned int pos2) const -{ - auto res = isl_union_set_list_swap(copy(), pos1, pos2); return manage(res); } -isl::union_set union_set_list::unite() const +inline std::ostream &operator<<(std::ostream &os, const union_set_list &obj) { - auto res = isl_union_set_list_union(copy()); - return manage(res); + char *str = isl_union_set_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } // implementations for isl::val @@ -20258,7 +22386,6 @@ val::val(const val &obj) ptr = obj.copy(); } - val::val(__isl_take isl_val *ptr) : ptr(ptr) {} @@ -20267,6 +22394,7 @@ val::val(isl::ctx ctx, long i) auto res = 
isl_val_int_from_si(ctx.release(), i); ptr = res; } + val::val(isl::ctx ctx, const std::string &str) { auto res = isl_val_read_from_str(ctx.release(), str.c_str()); @@ -20301,16 +22429,10 @@ bool val::is_null() const { return ptr == nullptr; } - isl::ctx val::ctx() const { return isl::ctx(isl_val_get_ctx(ptr)); } -void val::dump() const { - isl_val_dump(get()); -} - - isl::val val::abs() const { auto res = isl_val_abs(copy()); @@ -20323,16 +22445,20 @@ boolean val::abs_eq(const isl::val &v2) const return manage(res); } +boolean val::abs_eq(long v2) const +{ + return this->abs_eq(isl::val(ctx(), v2)); +} + isl::val val::add(isl::val v2) const { auto res = isl_val_add(copy(), v2.release()); return manage(res); } -isl::val val::add_ui(unsigned long v2) const +isl::val val::add(long v2) const { - auto res = isl_val_add_ui(copy(), v2); - return manage(res); + return this->add(isl::val(ctx(), v2)); } isl::val val::ceil() const @@ -20347,16 +22473,26 @@ int val::cmp_si(long i) const return res; } +long val::den_si() const +{ + auto res = isl_val_get_den_si(get()); + return res; +} + +long val::get_den_si() const +{ + return den_si(); +} + isl::val val::div(isl::val v2) const { auto res = isl_val_div(copy(), v2.release()); return manage(res); } -isl::val val::div_ui(unsigned long v2) const +isl::val val::div(long v2) const { - auto res = isl_val_div_ui(copy(), v2); - return manage(res); + return this->div(isl::val(ctx(), v2)); } boolean val::eq(const isl::val &v2) const @@ -20365,10 +22501,9 @@ boolean val::eq(const isl::val &v2) const return manage(res); } -boolean val::eq_si(long i) const +boolean val::eq(long v2) const { - auto res = isl_val_eq_si(get(), i); - return manage(res); + return this->eq(isl::val(ctx(), v2)); } isl::val val::floor() const @@ -20383,22 +22518,20 @@ isl::val val::gcd(isl::val v2) const return manage(res); } -boolean val::ge(const isl::val &v2) const +isl::val val::gcd(long v2) const { - auto res = isl_val_ge(get(), v2.get()); - return 
manage(res); + return this->gcd(isl::val(ctx(), v2)); } -uint32_t val::get_hash() const +boolean val::ge(const isl::val &v2) const { - auto res = isl_val_get_hash(get()); - return res; + auto res = isl_val_ge(get(), v2.get()); + return manage(res); } -long val::get_num_si() const +boolean val::ge(long v2) const { - auto res = isl_val_get_num_si(get()); - return res; + return this->ge(isl::val(ctx(), v2)); } boolean val::gt(const isl::val &v2) const @@ -20407,10 +22540,9 @@ boolean val::gt(const isl::val &v2) const return manage(res); } -boolean val::gt_si(long i) const +boolean val::gt(long v2) const { - auto res = isl_val_gt_si(get(), i); - return manage(res); + return this->gt(isl::val(ctx(), v2)); } isl::val val::infty(isl::ctx ctx) @@ -20437,6 +22569,11 @@ boolean val::is_divisible_by(const isl::val &v2) const return manage(res); } +boolean val::is_divisible_by(long v2) const +{ + return this->is_divisible_by(isl::val(ctx(), v2)); +} + boolean val::is_infty() const { auto res = isl_val_is_infty(get()); @@ -20515,46 +22652,64 @@ boolean val::le(const isl::val &v2) const return manage(res); } +boolean val::le(long v2) const +{ + return this->le(isl::val(ctx(), v2)); +} + boolean val::lt(const isl::val &v2) const { auto res = isl_val_lt(get(), v2.get()); return manage(res); } +boolean val::lt(long v2) const +{ + return this->lt(isl::val(ctx(), v2)); +} + isl::val val::max(isl::val v2) const { auto res = isl_val_max(copy(), v2.release()); return manage(res); } +isl::val val::max(long v2) const +{ + return this->max(isl::val(ctx(), v2)); +} + isl::val val::min(isl::val v2) const { auto res = isl_val_min(copy(), v2.release()); return manage(res); } +isl::val val::min(long v2) const +{ + return this->min(isl::val(ctx(), v2)); +} + isl::val val::mod(isl::val v2) const { auto res = isl_val_mod(copy(), v2.release()); return manage(res); } -isl::val val::mul(isl::val v2) const +isl::val val::mod(long v2) const { - auto res = isl_val_mul(copy(), v2.release()); - return 
manage(res); + return this->mod(isl::val(ctx(), v2)); } -isl::val val::mul_ui(unsigned long v2) const +isl::val val::mul(isl::val v2) const { - auto res = isl_val_mul_ui(copy(), v2); + auto res = isl_val_mul(copy(), v2.release()); return manage(res); } -isl_size val::n_abs_num_chunks(size_t size) const +isl::val val::mul(long v2) const { - auto res = isl_val_n_abs_num_chunks(get(), size); - return res; + return this->mul(isl::val(ctx(), v2)); } isl::val val::nan(isl::ctx ctx) @@ -20569,6 +22724,11 @@ boolean val::ne(const isl::val &v2) const return manage(res); } +boolean val::ne(long v2) const +{ + return this->ne(isl::val(ctx(), v2)); +} + isl::val val::neg() const { auto res = isl_val_neg(copy()); @@ -20587,6 +22747,17 @@ isl::val val::negone(isl::ctx ctx) return manage(res); } +long val::num_si() const +{ + auto res = isl_val_get_num_si(get()); + return res; +} + +long val::get_num_si() const +{ + return num_si(); +} + isl::val val::one(isl::ctx ctx) { auto res = isl_val_one(ctx.release()); @@ -20599,12 +22770,6 @@ isl::val val::pow2() const return manage(res); } -isl::val val::set_si(long i) const -{ - auto res = isl_val_set_si(copy(), i); - return manage(res); -} - int val::sgn() const { auto res = isl_val_sgn(get()); @@ -20617,12 +22782,23 @@ isl::val val::sub(isl::val v2) const return manage(res); } +isl::val val::sub(long v2) const +{ + return this->sub(isl::val(ctx(), v2)); +} + isl::val val::sub_ui(unsigned long v2) const { auto res = isl_val_sub_ui(copy(), v2); return manage(res); } +isl::val_list val::to_list() const +{ + auto res = isl_val_to_list(copy()); + return manage(res); +} + isl::val val::trunc() const { auto res = isl_val_trunc(copy()); @@ -20635,6 +22811,18 @@ isl::val val::zero(isl::ctx ctx) return manage(res); } +inline std::ostream &operator<<(std::ostream &os, const val &obj) +{ + char *str = isl_val_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; +} + // 
implementations for isl::val_list val_list manage(__isl_take isl_val_list *ptr) { return val_list(ptr); @@ -20653,10 +22841,26 @@ val_list::val_list(const val_list &obj) ptr = obj.copy(); } - val_list::val_list(__isl_take isl_val_list *ptr) : ptr(ptr) {} +val_list::val_list(isl::ctx ctx, int n) +{ + auto res = isl_val_list_alloc(ctx.release(), n); + ptr = res; +} + +val_list::val_list(isl::val el) +{ + auto res = isl_val_list_from_val(el.release()); + ptr = res; +} + +val_list::val_list(isl::ctx ctx, const std::string &str) +{ + auto res = isl_val_list_read_from_str(ctx.release(), str.c_str()); + ptr = res; +} val_list &val_list::operator=(val_list obj) { std::swap(this->ptr, obj.ptr); @@ -20686,28 +22890,32 @@ bool val_list::is_null() const { return ptr == nullptr; } - isl::ctx val_list::ctx() const { return isl::ctx(isl_val_list_get_ctx(ptr)); } -void val_list::dump() const { - isl_val_list_dump(get()); -} - - isl::val_list val_list::add(isl::val el) const { auto res = isl_val_list_add(copy(), el.release()); return manage(res); } -isl::val_list val_list::alloc(isl::ctx ctx, int n) +isl::val_list val_list::add(long el) const { - auto res = isl_val_list_alloc(ctx.release(), n); + return this->add(isl::val(ctx(), el)); +} + +isl::val val_list::at(int index) const +{ + auto res = isl_val_list_get_at(get(), index); return manage(res); } +isl::val val_list::get_at(int index) const +{ + return at(index); +} + isl::val_list val_list::clear() const { auto res = isl_val_list_clear(copy()); @@ -20726,279 +22934,48 @@ isl::val_list val_list::drop(unsigned int first, unsigned int n) const return manage(res); } -stat val_list::foreach(const std::function &fn) const +stat val_list::foreach(const std::function &fn) const { struct fn_data { - const std::function *func; - } fn_data = { &fn }; + std::function func; + } fn_data = { fn }; auto fn_lambda = [](isl_val *arg_0, void *arg_1) -> isl_stat { auto *data = static_cast(arg_1); - stat ret = (*data->func)(manage(arg_0)); + auto 
ret = (data->func)(manage(arg_0)); return ret.release(); }; auto res = isl_val_list_foreach(get(), fn_lambda, &fn_data); return manage(res); } -isl::val_list val_list::from_val(isl::val el) -{ - auto res = isl_val_list_from_val(el.release()); - return manage(res); -} - -isl::val val_list::get_at(int index) const -{ - auto res = isl_val_list_get_at(get(), index); - return manage(res); -} - -isl::val val_list::get_val(int index) const -{ - auto res = isl_val_list_get_val(get(), index); - return manage(res); -} - isl::val_list val_list::insert(unsigned int pos, isl::val el) const { auto res = isl_val_list_insert(copy(), pos, el.release()); return manage(res); } -isl_size val_list::n_val() const -{ - auto res = isl_val_list_n_val(get()); - return res; -} - -isl::val_list val_list::reverse() const -{ - auto res = isl_val_list_reverse(copy()); - return manage(res); -} - -isl::val_list val_list::set_val(int index, isl::val el) const +isl::val_list val_list::insert(unsigned int pos, long el) const { - auto res = isl_val_list_set_val(copy(), index, el.release()); - return manage(res); + return this->insert(pos, isl::val(ctx(), el)); } -isl_size val_list::size() const +class size val_list::size() const { auto res = isl_val_list_size(get()); - return res; -} - -isl::val_list val_list::swap(unsigned int pos1, unsigned int pos2) const -{ - auto res = isl_val_list_swap(copy(), pos1, pos2); - return manage(res); -} - -// implementations for isl::vec -vec manage(__isl_take isl_vec *ptr) { - return vec(ptr); -} -vec manage_copy(__isl_keep isl_vec *ptr) { - ptr = isl_vec_copy(ptr); - return vec(ptr); -} - -vec::vec() - : ptr(nullptr) {} - -vec::vec(const vec &obj) - : ptr(nullptr) -{ - ptr = obj.copy(); -} - - -vec::vec(__isl_take isl_vec *ptr) - : ptr(ptr) {} - - -vec &vec::operator=(vec obj) { - std::swap(this->ptr, obj.ptr); - return *this; -} - -vec::~vec() { - if (ptr) - isl_vec_free(ptr); -} - -__isl_give isl_vec *vec::copy() const & { - return isl_vec_copy(ptr); -} - 
-__isl_keep isl_vec *vec::get() const { - return ptr; -} - -__isl_give isl_vec *vec::release() { - isl_vec *tmp = ptr; - ptr = nullptr; - return tmp; -} - -bool vec::is_null() const { - return ptr == nullptr; -} - - -isl::ctx vec::ctx() const { - return isl::ctx(isl_vec_get_ctx(ptr)); -} - -void vec::dump() const { - isl_vec_dump(get()); -} - - -isl::vec vec::add(isl::vec vec2) const -{ - auto res = isl_vec_add(copy(), vec2.release()); - return manage(res); -} - -isl::vec vec::add_els(unsigned int n) const -{ - auto res = isl_vec_add_els(copy(), n); - return manage(res); -} - -isl::vec vec::alloc(isl::ctx ctx, unsigned int size) -{ - auto res = isl_vec_alloc(ctx.release(), size); - return manage(res); -} - -isl::vec vec::ceil() const -{ - auto res = isl_vec_ceil(copy()); - return manage(res); -} - -isl::vec vec::clr() const -{ - auto res = isl_vec_clr(copy()); - return manage(res); -} - -int vec::cmp_element(const isl::vec &vec2, int pos) const -{ - auto res = isl_vec_cmp_element(get(), vec2.get(), pos); - return res; -} - -isl::vec vec::concat(isl::vec vec2) const -{ - auto res = isl_vec_concat(copy(), vec2.release()); - return manage(res); -} - -isl::vec vec::drop_els(unsigned int pos, unsigned int n) const -{ - auto res = isl_vec_drop_els(copy(), pos, n); - return manage(res); -} - -isl::vec vec::extend(unsigned int size) const -{ - auto res = isl_vec_extend(copy(), size); - return manage(res); -} - -isl::val vec::get_element_val(int pos) const -{ - auto res = isl_vec_get_element_val(get(), pos); - return manage(res); -} - -isl::vec vec::insert_els(unsigned int pos, unsigned int n) const -{ - auto res = isl_vec_insert_els(copy(), pos, n); - return manage(res); -} - -isl::vec vec::insert_zero_els(unsigned int pos, unsigned int n) const -{ - auto res = isl_vec_insert_zero_els(copy(), pos, n); - return manage(res); -} - -boolean vec::is_equal(const isl::vec &vec2) const -{ - auto res = isl_vec_is_equal(get(), vec2.get()); - return manage(res); -} - -isl::vec 
vec::mat_product(isl::mat mat) const -{ - auto res = isl_vec_mat_product(copy(), mat.release()); - return manage(res); -} - -isl::vec vec::move_els(unsigned int dst_col, unsigned int src_col, unsigned int n) const -{ - auto res = isl_vec_move_els(copy(), dst_col, src_col, n); - return manage(res); -} - -isl::vec vec::neg() const -{ - auto res = isl_vec_neg(copy()); - return manage(res); -} - -isl::vec vec::set_element_si(int pos, int v) const -{ - auto res = isl_vec_set_element_si(copy(), pos, v); - return manage(res); -} - -isl::vec vec::set_element_val(int pos, isl::val v) const -{ - auto res = isl_vec_set_element_val(copy(), pos, v.release()); - return manage(res); -} - -isl::vec vec::set_si(int v) const -{ - auto res = isl_vec_set_si(copy(), v); - return manage(res); -} - -isl::vec vec::set_val(isl::val v) const -{ - auto res = isl_vec_set_val(copy(), v.release()); - return manage(res); -} - -isl_size vec::size() const -{ - auto res = isl_vec_size(get()); - return res; -} - -isl::vec vec::sort() const -{ - auto res = isl_vec_sort(copy()); - return manage(res); -} - -isl::vec vec::zero(isl::ctx ctx, unsigned int size) -{ - auto res = isl_vec_zero(ctx.release(), size); return manage(res); } -isl::vec vec::zero_extend(unsigned int size) const +inline std::ostream &operator<<(std::ostream &os, const val_list &obj) { - auto res = isl_vec_zero_extend(copy(), size); - return manage(res); + char *str = isl_val_list_to_str(obj.get()); + if (!str) { + os.setstate(std::ios_base::badbit); + return os; + } + os << str; + free(str); + return os; } -} // namespace noexceptions } // namespace isl #endif /* ISL_CPP_CHECKED */ diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp index d1da965d1b03c..409dbf4766dd9 100644 --- a/polly/lib/Support/GICHelper.cpp +++ b/polly/lib/Support/GICHelper.cpp @@ -199,49 +199,43 @@ std::string polly::getIslCompatibleName(const std::string &Prefix, /// not know that it is never called, and therefore must ensure the 
existence of /// the dump functions. void neverCalled() { - isl::aff().dump(); - isl::aff_list().dump(); - isl::ast_expr().dump(); - isl::ast_expr_list().dump(); - isl::ast_node().dump(); - isl::ast_node_list().dump(); - isl::basic_map().dump(); - isl::basic_map_list().dump(); - isl::basic_set().dump(); - isl::basic_set_list().dump(); - isl::constraint().dump(); - isl::constraint_list().dump(); - isl::id().dump(); - isl::id_list().dump(); - isl::id_to_ast_expr().dump(); - isl::local_space().dump(); - isl::map().dump(); - isl::map_list().dump(); - isl::multi_aff().dump(); - isl::multi_pw_aff().dump(); - isl::multi_union_pw_aff().dump(); - isl::multi_val().dump(); - isl::point().dump(); - isl::pw_aff().dump(); - isl::pw_aff_list().dump(); - isl::pw_multi_aff().dump(); - isl::pw_qpolynomial().dump(); - isl::qpolynomial().dump(); - isl::schedule().dump(); - isl::schedule_constraints().dump(); - isl::schedule_node().dump(); - isl::set().dump(); - isl::set_list().dump(); - isl::space().dump(); - isl::union_map().dump(); - isl::union_map_list().dump(); - isl::union_pw_aff().dump(); - isl::union_pw_aff_list().dump(); - isl::union_pw_multi_aff().dump(); - isl::union_pw_multi_aff_list().dump(); - isl::union_set().dump(); - isl::union_set_list().dump(); - isl::val().dump(); - isl::val_list().dump(); + polly::dumpIslObj(isl::aff()); + polly::dumpIslObj(isl::aff_list()); + polly::dumpIslObj(isl::ast_expr()); + polly::dumpIslObj(isl::ast_node()); + polly::dumpIslObj(isl::ast_node_list()); + polly::dumpIslObj(isl::basic_map()); + polly::dumpIslObj(isl::basic_map_list()); + polly::dumpIslObj(isl::basic_set()); + polly::dumpIslObj(isl::basic_set_list()); + polly::dumpIslObj(isl::constraint()); + polly::dumpIslObj(isl::id()); + polly::dumpIslObj(isl::id_list()); + polly::dumpIslObj(isl::id_to_ast_expr()); + polly::dumpIslObj(isl::local_space()); + polly::dumpIslObj(isl::map()); + polly::dumpIslObj(isl::map_list()); + polly::dumpIslObj(isl::multi_aff()); + 
polly::dumpIslObj(isl::multi_pw_aff()); + polly::dumpIslObj(isl::multi_union_pw_aff()); + polly::dumpIslObj(isl::multi_val()); + polly::dumpIslObj(isl::point()); + polly::dumpIslObj(isl::pw_aff()); + polly::dumpIslObj(isl::pw_aff_list()); + polly::dumpIslObj(isl::pw_multi_aff()); + polly::dumpIslObj(isl::schedule()); + polly::dumpIslObj(isl::schedule_constraints()); + polly::dumpIslObj(isl::schedule_node()); + polly::dumpIslObj(isl::set()); + polly::dumpIslObj(isl::set_list()); + polly::dumpIslObj(isl::space()); + polly::dumpIslObj(isl::union_map()); + polly::dumpIslObj(isl::union_pw_aff()); + polly::dumpIslObj(isl::union_pw_aff_list()); + polly::dumpIslObj(isl::union_pw_multi_aff()); + polly::dumpIslObj(isl::union_set()); + polly::dumpIslObj(isl::union_set_list()); + polly::dumpIslObj(isl::val()); + polly::dumpIslObj(isl::val_list()); } #endif diff --git a/polly/lib/Support/ISLTools.cpp b/polly/lib/Support/ISLTools.cpp index 0e23d5ef5ecf0..1afd07e504dd6 100644 --- a/polly/lib/Support/ISLTools.cpp +++ b/polly/lib/Support/ISLTools.cpp @@ -36,7 +36,7 @@ isl::multi_aff makeShiftDimAff(isl::space Space, int Pos, int Amount) { auto Identity = isl::multi_aff::identity(Space); if (Amount == 0) return Identity; - auto ShiftAff = Identity.get_aff(Pos); + auto ShiftAff = Identity.at(Pos); ShiftAff = ShiftAff.set_constant_si(Amount); return Identity.set_aff(Pos, ShiftAff); } @@ -56,8 +56,8 @@ isl::basic_map makeTupleSwapBasicMap(isl::space FromSpace1, assert(FromSpace1.is_set()); assert(FromSpace2.is_set()); - unsigned Dims1 = FromSpace1.dim(isl::dim::set); - unsigned Dims2 = FromSpace2.dim(isl::dim::set); + unsigned Dims1 = FromSpace1.dim(isl::dim::set).release(); + unsigned Dims2 = FromSpace2.dim(isl::dim::set).release(); isl::space FromSpace = FromSpace1.map_from_domain_and_range(FromSpace2).wrap(); @@ -166,7 +166,7 @@ isl_size polly::getNumScatterDims(const isl::union_map &Schedule) { if (Map.is_null()) continue; - Dims = std::max(Dims, Map.range_tuple_dim()); + Dims = 
std::max(Dims, Map.range_tuple_dim().release()); } return Dims; } @@ -214,7 +214,7 @@ isl::union_map polly::reverseDomain(const isl::union_map &UMap) { } isl::set polly::shiftDim(isl::set Set, int Pos, int Amount) { - int NumDims = Set.tuple_dim(); + int NumDims = Set.tuple_dim().release(); if (Pos < 0) Pos = NumDims + Pos; assert(Pos < NumDims && "Dimension index must be in range"); @@ -235,7 +235,7 @@ isl::union_set polly::shiftDim(isl::union_set USet, int Pos, int Amount) { } isl::map polly::shiftDim(isl::map Map, isl::dim Dim, int Pos, int Amount) { - int NumDims = Map.dim(Dim); + int NumDims = Map.dim(Dim).release(); if (Pos < 0) Pos = NumDims + Pos; assert(Pos < NumDims && "Dimension index must be in range"); @@ -449,16 +449,16 @@ isl::map polly::distributeDomain(isl::map Map) { isl::space DomainSpace = Space.domain(); if (DomainSpace.is_null()) return {}; - unsigned DomainDims = DomainSpace.dim(isl::dim::set); + unsigned DomainDims = DomainSpace.dim(isl::dim::set).release(); isl::space RangeSpace = Space.range().unwrap(); isl::space Range1Space = RangeSpace.domain(); if (Range1Space.is_null()) return {}; - unsigned Range1Dims = Range1Space.dim(isl::dim::set); + unsigned Range1Dims = Range1Space.dim(isl::dim::set).release(); isl::space Range2Space = RangeSpace.range(); if (Range2Space.is_null()) return {}; - unsigned Range2Dims = Range2Space.dim(isl::dim::set); + unsigned Range2Dims = Range2Space.dim(isl::dim::set).release(); isl::space OutputSpace = DomainSpace.map_from_domain_and_range(Range1Space) @@ -606,17 +606,17 @@ static int flatCompare(const isl::basic_set &A, const isl::basic_set &B) { if (A.is_null() || B.is_null()) return 0; - unsigned ALen = A.dim(isl::dim::set); - unsigned BLen = B.dim(isl::dim::set); + unsigned ALen = A.dim(isl::dim::set).release(); + unsigned BLen = B.dim(isl::dim::set).release(); unsigned Len = std::min(ALen, BLen); for (unsigned i = 0; i < Len; i += 1) { isl::basic_set ADim = - A.project_out(isl::dim::param, 0, 
A.dim(isl::dim::param)) + A.project_out(isl::dim::param, 0, A.dim(isl::dim::param).release()) .project_out(isl::dim::set, i + 1, ALen - i - 1) .project_out(isl::dim::set, 0, i); isl::basic_set BDim = - B.project_out(isl::dim::param, 0, B.dim(isl::dim::param)) + B.project_out(isl::dim::param, 0, B.dim(isl::dim::param).release()) .project_out(isl::dim::set, i + 1, BLen - i - 1) .project_out(isl::dim::set, 0, i); @@ -687,7 +687,8 @@ static int structureCompare(const isl::space &ASpace, const isl::space &BSpace, return NameCompare; if (ConsiderTupleLen) { - int LenCompare = BSpace.dim(isl::dim::set) - ASpace.dim(isl::dim::set); + int LenCompare = BSpace.dim(isl::dim::set).release() - + ASpace.dim(isl::dim::set).release(); if (LenCompare != 0) return LenCompare; } @@ -782,14 +783,14 @@ static void printSortedPolyhedra(isl::union_set USet, llvm::raw_ostream &OS, } static void recursiveExpand(isl::basic_set BSet, int Dim, isl::set &Expanded) { - int Dims = BSet.dim(isl::dim::set); + int Dims = BSet.dim(isl::dim::set).release(); if (Dim >= Dims) { Expanded = Expanded.unite(BSet); return; } isl::basic_set DimOnly = - BSet.project_out(isl::dim::param, 0, BSet.dim(isl::dim::param)) + BSet.project_out(isl::dim::param, 0, BSet.dim(isl::dim::param).release()) .project_out(isl::dim::set, Dim + 1, Dims - Dim - 1) .project_out(isl::dim::set, 0, Dim); if (!DimOnly.is_bounded()) { diff --git a/polly/lib/Transform/FlattenAlgo.cpp b/polly/lib/Transform/FlattenAlgo.cpp index 6edece553a207..d9efe3fbfa844 100644 --- a/polly/lib/Transform/FlattenAlgo.cpp +++ b/polly/lib/Transform/FlattenAlgo.cpp @@ -26,10 +26,10 @@ namespace { /// i.e. there are two constants Min and Max, such that every value x of the /// chosen dimensions is Min <= x <= Max. 
bool isDimBoundedByConstant(isl::set Set, unsigned dim) { - auto ParamDims = Set.dim(isl::dim::param); + auto ParamDims = Set.dim(isl::dim::param).release(); Set = Set.project_out(isl::dim::param, 0, ParamDims); Set = Set.project_out(isl::dim::set, 0, dim); - auto SetDims = Set.tuple_dim(); + auto SetDims = Set.tuple_dim().release(); Set = Set.project_out(isl::dim::set, 1, SetDims - 1); return bool(Set.is_bounded()); } @@ -40,7 +40,7 @@ bool isDimBoundedByConstant(isl::set Set, unsigned dim) { /// Min_p <= x <= Max_p. bool isDimBoundedByParameter(isl::set Set, unsigned dim) { Set = Set.project_out(isl::dim::set, 0, dim); - auto SetDims = Set.tuple_dim(); + auto SetDims = Set.tuple_dim().release(); Set = Set.project_out(isl::dim::set, 1, SetDims - 1); return bool(Set.is_bounded()); } @@ -135,7 +135,7 @@ isl_size scheduleScatterDims(const isl::union_map &Schedule) { if (Map.is_null()) continue; - Dims = std::max(Dims, Map.range_tuple_dim()); + Dims = std::max(Dims, Map.range_tuple_dim().release()); } return Dims; } @@ -144,7 +144,7 @@ isl_size scheduleScatterDims(const isl::union_map &Schedule) { isl::union_pw_aff scheduleExtractDimAff(isl::union_map UMap, unsigned pos) { auto SingleUMap = isl::union_map::empty(UMap.ctx()); for (isl::map Map : UMap.get_map_list()) { - unsigned MapDims = Map.range_tuple_dim(); + unsigned MapDims = Map.range_tuple_dim().release(); isl::map SingleMap = Map.project_out(isl::dim::out, 0, pos); SingleMap = SingleMap.project_out(isl::dim::out, 1, MapDims - pos - 1); SingleUMap = SingleUMap.unite(SingleMap); @@ -152,7 +152,7 @@ isl::union_pw_aff scheduleExtractDimAff(isl::union_map UMap, unsigned pos) { auto UAff = isl::union_pw_multi_aff(SingleUMap); auto FirstMAff = isl::multi_union_pw_aff(UAff); - return FirstMAff.get_union_pw_aff(0); + return FirstMAff.at(0); } /// Flatten a sequence-like first dimension. 
@@ -179,7 +179,7 @@ isl::union_map tryFlattenSequence(isl::union_map Schedule) { auto ScatterSet = isl::set(Schedule.range()); auto ParamSpace = Schedule.get_space().params(); - auto Dims = ScatterSet.tuple_dim(); + auto Dims = ScatterSet.tuple_dim().release(); assert(Dims >= 2); // Would cause an infinite loop. @@ -238,8 +238,10 @@ isl::union_map tryFlattenSequence(isl::union_map Schedule) { auto FirstScheduleAffWithOffset = FirstScheduleAffNormalized.add(AllCounter); - auto ScheduleWithOffset = isl::union_map(FirstScheduleAffWithOffset) - .flat_range_product(RemainingSubSchedule); + auto ScheduleWithOffset = + isl::union_map::from( + isl::union_pw_multi_aff(FirstScheduleAffWithOffset)) + .flat_range_product(RemainingSubSchedule); NewSchedule = NewSchedule.unite(ScheduleWithOffset); ScatterSet = ScatterSet.subtract(ScatterFirst); @@ -269,7 +271,7 @@ isl::union_map tryFlattenLoop(isl::union_map Schedule) { auto SubDims = scheduleScatterDims(SubSchedule); auto SubExtent = isl::set(SubSchedule.range()); - auto SubExtentDims = SubExtent.dim(isl::dim::param); + auto SubExtentDims = SubExtent.dim(isl::dim::param).release(); SubExtent = SubExtent.project_out(isl::dim::param, 0, SubExtentDims); SubExtent = SubExtent.project_out(isl::dim::set, 1, SubDims - 1); @@ -294,15 +296,15 @@ isl::union_map tryFlattenLoop(isl::union_map Schedule) { auto FirstSubScheduleAff = scheduleExtractDimAff(SubSchedule, 0); auto RemainingSubSchedule = scheduleProjectOut(std::move(SubSchedule), 0, 1); - auto LenVal = MaxVal.sub(MinVal).add_ui(1); + auto LenVal = MaxVal.sub(MinVal).add(1); auto FirstSubScheduleNormalized = subtract(FirstSubScheduleAff, MinVal); // TODO: Normalize FirstAff to zero (convert to isl_map, determine minimum, // subtract it) auto FirstAff = scheduleExtractDimAff(Schedule, 0); auto Offset = multiply(FirstAff, LenVal); - auto Index = FirstSubScheduleNormalized.add(Offset); - auto IndexMap = isl::union_map(Index); + isl::union_pw_multi_aff Index = 
FirstSubScheduleNormalized.add(Offset); + auto IndexMap = isl::union_map::from(Index); auto Result = IndexMap.flat_range_product(RemainingSubSchedule); LLVM_DEBUG(dbgs() << "Loop-flatten result is:\n " << Result << "\n"); diff --git a/polly/lib/Transform/MatmulOptimizer.cpp b/polly/lib/Transform/MatmulOptimizer.cpp index 7e0a837be9ed5..3845e9bb8903d 100644 --- a/polly/lib/Transform/MatmulOptimizer.cpp +++ b/polly/lib/Transform/MatmulOptimizer.cpp @@ -188,8 +188,8 @@ static isl::union_set getUnrollIsolatedSetOptions(isl::ctx Ctx) { /// @return The modified map. static isl::map permuteDimensions(isl::map Map, isl::dim DimType, unsigned DstPos, unsigned SrcPos) { - assert((isl_size)DstPos < Map.dim(DimType) && - (isl_size)SrcPos < Map.dim(DimType)); + assert((isl_size)DstPos < Map.dim(DimType).release() && + (isl_size)SrcPos < Map.dim(DimType).release()); if (DstPos == SrcPos) return Map; isl::id DimId; @@ -229,7 +229,7 @@ static bool isMatMulOperandAcc(isl::set Domain, isl::map AccMap, int &FirstPos, isl::space Space = AccMap.get_space(); isl::map Universe = isl::map::universe(Space); - if (Space.dim(isl::dim::out) != 2) + if (Space.dim(isl::dim::out).release() != 2) return false; // MatMul has the form: @@ -317,7 +317,7 @@ static bool containsOnlyMatrMultAcc(isl::map PartialSchedule, MatMulInfoTy &MMI) { auto InputDimId = PartialSchedule.get_tuple_id(isl::dim::in); auto *Stmt = static_cast(InputDimId.get_user()); - isl_size OutDimNum = PartialSchedule.range_tuple_dim(); + isl_size OutDimNum = PartialSchedule.range_tuple_dim().release(); assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest " "and, consequently, the corresponding scheduling " "functions have at least three dimensions."); @@ -363,7 +363,7 @@ static bool containsOnlyMatMulDep(isl::map Schedule, const Dependences *D, auto DomainSpace = Schedule.get_space().domain(); auto Space = DomainSpace.map_from_domain_and_range(DomainSpace); auto Deltas = Dep.extract_map(Space).deltas(); - 
isl_size DeltasDimNum = Deltas.dim(isl::dim::set); + isl_size DeltasDimNum = Deltas.dim(isl::dim::set).release(); for (int i = 0; i < DeltasDimNum; i++) { auto Val = Deltas.plain_get_val_if_fixed(isl::dim::set, i); Pos = Pos < 0 && Val.is_one() ? i : Pos; @@ -445,8 +445,8 @@ static isl::schedule_node permuteBandNodeDimensions(isl::schedule_node Node, std::max(FirstDim, SecondDim)); auto PartialSchedule = isl::manage(isl_schedule_node_band_get_partial_schedule(Node.get())); - auto PartialScheduleFirstDim = PartialSchedule.get_union_pw_aff(FirstDim); - auto PartialScheduleSecondDim = PartialSchedule.get_union_pw_aff(SecondDim); + auto PartialScheduleFirstDim = PartialSchedule.at(FirstDim); + auto PartialScheduleSecondDim = PartialSchedule.at(SecondDim); PartialSchedule = PartialSchedule.set_union_pw_aff(SecondDim, PartialScheduleFirstDim); PartialSchedule = @@ -492,7 +492,7 @@ createMacroKernel(isl::schedule_node Node, Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1); // Mark the outermost loop as parallelizable. - Node = Node.band_member_set_coincident(0, true); + Node = Node.as().member_set_coincident(0, true); return Node.child(0).child(0); } @@ -729,7 +729,7 @@ static isl::schedule_node optimizePackedB(isl::schedule_node Node, // Insert into the schedule tree. 
isl::map ExtMap = MapOldIndVar.project_out( - isl::dim::out, 2, MapOldIndVar.range_tuple_dim() - 2); + isl::dim::out, 2, MapOldIndVar.range_tuple_dim().release() - 2); ExtMap = ExtMap.reverse(); ExtMap = ExtMap.fix_si(isl::dim::out, MMI.i, 0); ExtMap = ExtMap.intersect_range(Domain); @@ -870,9 +870,9 @@ getInductionVariablesSubstitution(isl::schedule_node Node, auto Child = Node.child(0); auto UnMapOldIndVar = Child.get_prefix_schedule_union_map(); auto MapOldIndVar = isl::map::from_union_map(UnMapOldIndVar); - if (MapOldIndVar.range_tuple_dim() > 9) - return MapOldIndVar.project_out(isl::dim::out, 0, - MapOldIndVar.range_tuple_dim() - 9); + if (MapOldIndVar.range_tuple_dim().release() > 9) + return MapOldIndVar.project_out( + isl::dim::out, 0, MapOldIndVar.range_tuple_dim().release() - 9); return MapOldIndVar; } @@ -893,10 +893,10 @@ getInductionVariablesSubstitution(isl::schedule_node Node, static isl::schedule_node isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node, struct MicroKernelParamsTy MicroKernelParams) { - isl::schedule_node Child = Node.get_child(0); + isl::schedule_node Child = Node.child(0); isl::union_map UnMapOldIndVar = Child.get_prefix_schedule_relation(); isl::set Prefix = isl::map::from_union_map(UnMapOldIndVar).range(); - isl_size Dims = Prefix.tuple_dim(); + isl_size Dims = Prefix.tuple_dim().release(); Prefix = Prefix.project_out(isl::dim::set, Dims - 1, 1); Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr); Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr); @@ -906,11 +906,11 @@ isolateAndUnrollMatMulInnerLoops(isl::schedule_node Node, isl::ctx Ctx = Node.ctx(); auto Options = IsolateOption.unite(getDimOptions(Ctx, "unroll")); Options = Options.unite(getUnrollIsolatedSetOptions(Ctx)); - Node = Node.band_set_ast_build_options(Options); + Node = Node.as().set_ast_build_options(Options); Node = Node.parent().parent().parent(); IsolateOption = getIsolateOptions(Prefix, 3); Options = 
IsolateOption.unite(getDimOptions(Ctx, "separate")); - Node = Node.band_set_ast_build_options(Options); + Node = Node.as().set_ast_build_options(Options); Node = Node.child(0).child(0).child(0); return Node; } @@ -953,8 +953,8 @@ getBandNodeWithOriginDimOrder(isl::schedule_node Node) { return Node; auto Domain = Node.get_universe_domain(); assert(isl_union_set_n_set(Domain.get()) == 1); - if (Node.get_schedule_depth() != 0 || - (isl::set(Domain).tuple_dim() != + if (Node.get_schedule_depth().release() != 0 || + (isl::set(Domain).tuple_dim().release() != isl_schedule_node_band_n_member(Node.get()))) return Node; Node = isl::manage(isl_schedule_node_delete(Node.copy())); @@ -1029,7 +1029,7 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D, Node = Node.parent(); if (LeafType != isl_schedule_node_leaf || isl_schedule_node_band_n_member(Node.get()) < 3 || - Node.get_schedule_depth() != 0 || + Node.get_schedule_depth().release() != 0 || isl_union_map_n_map(PartialSchedule.get()) != 1) return false; auto NewPartialSchedule = isl::map::from_union_map(PartialSchedule); diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index a28e39bcd5625..4a1665633f092 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -118,10 +118,10 @@ class MaximalStaticExpander : public ScopPass { /// i.e. there are two constants Min and Max, such that every value x of the /// chosen dimensions is Min <= x <= Max. 
static bool isDimBoundedByConstant(isl::set Set, unsigned dim) { - auto ParamDims = Set.dim(isl::dim::param); + auto ParamDims = Set.dim(isl::dim::param).release(); Set = Set.project_out(isl::dim::param, 0, ParamDims); Set = Set.project_out(isl::dim::set, 0, dim); - auto SetDims = Set.tuple_dim(); + auto SetDims = Set.tuple_dim().release(); Set = Set.project_out(isl::dim::set, 1, SetDims - 1); return bool(Set.is_bounded()); } @@ -350,7 +350,7 @@ ScopArrayInfo *MaximalStaticExpander::expandAccess(Scop &S, MemoryAccess *MA) { // Get the current AM. auto CurrentAccessMap = MA->getAccessRelation(); - unsigned in_dimensions = CurrentAccessMap.domain_tuple_dim(); + unsigned in_dimensions = CurrentAccessMap.domain_tuple_dim().release(); // Get domain from the current AM. auto Domain = CurrentAccessMap.domain(); @@ -405,7 +405,7 @@ ScopArrayInfo *MaximalStaticExpander::expandAccess(Scop &S, MemoryAccess *MA) { // Add constraints to linked output with input id. auto SpaceMap = NewAccessMap.get_space(); auto ConstraintBasicMap = - isl::basic_map::equal(SpaceMap, SpaceMap.dim(isl::dim::in)); + isl::basic_map::equal(SpaceMap, SpaceMap.dim(isl::dim::in).release()); NewAccessMap = isl::map(ConstraintBasicMap); // Set the new access relation map. 
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 9d344a1526976..3d26a04cada0d 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -366,8 +366,9 @@ ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = Node.parent().parent(); isl::union_set Options = IsolateOption.unite(AtomicOption); - Node = Node.band_set_ast_build_options(Options); - return Node; + isl::schedule_node_band Result = + Node.as().set_ast_build_options(Options); + return Result; } isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( @@ -375,7 +376,7 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); - isl_size ScheduleDimensions = Space.dim(isl::dim::set); + isl_size ScheduleDimensions = Space.dim(isl::dim::set).release(); assert((isl_size)DimToVectorize < ScheduleDimensions); if (DimToVectorize > 0) { @@ -394,9 +395,10 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( Node = Node.child(0); // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise, // we will have troubles to match it in the backend. 
- Node = Node.band_set_ast_build_options( - isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); - Node = isl::manage(isl_schedule_node_band_sink(Node.release())); + isl::schedule_node_band NodeBand = + Node.as().set_ast_build_options( + isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }")); + Node = isl::manage(isl_schedule_node_band_sink(NodeBand.release())); Node = Node.child(0); if (isl_schedule_node_get_type(Node.get()) == isl_schedule_node_leaf) Node = Node.parent(); @@ -442,7 +444,7 @@ bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) { return false; auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); - auto Dims = Space.dim(isl::dim::set); + auto Dims = Space.dim(isl::dim::set).release(); if (Dims <= 1) return false; @@ -474,10 +476,10 @@ ScheduleTreeOptimizer::standardBandOpts(isl::schedule_node Node, void *User) { return Node; auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); - auto Dims = Space.dim(isl::dim::set); + auto Dims = Space.dim(isl::dim::set).release(); for (int i = Dims - 1; i >= 0; i--) - if (Node.band_member_get_coincident(i)) { + if (Node.as().member_get_coincident(i)) { Node = prevectSchedBand(Node, i, PrevectorWidth); break; } @@ -615,7 +617,7 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { int CountMembers = isl_schedule_node_band_n_member(Node.get()); NumBandMembers[Version] += CountMembers; for (int i = 0; i < CountMembers; i += 1) { - if (Node.band_member_get_coincident(i)) + if (Node.as().member_get_coincident(i)) NumCoincident[Version]++; } break; diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index c0006a5b0f13f..3ce005e3c4b22 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -80,7 +80,7 @@ struct ScheduleTreeRewriter isl::schedule NewChild = getDerived().visit(Band.child(0), std::forward(args)...); 
isl::schedule_node NewNode = - NewChild.insert_partial_schedule(PartialSched).get_root().get_child(0); + NewChild.insert_partial_schedule(PartialSched).get_root().child(0); // Reapply permutability and coincidence attributes. NewNode = isl::manage(isl_schedule_node_band_set_permutable( @@ -123,7 +123,8 @@ struct ScheduleTreeRewriter } isl::schedule visitMark(const isl::schedule_node &Mark, Args... args) { - isl::id TheMark = Mark.mark_get_id(); + + isl::id TheMark = Mark.as().get_id(); isl::schedule_node NewChild = getDerived() .visit(Mark.first_child(), std::forward(args)...) @@ -134,7 +135,8 @@ struct ScheduleTreeRewriter isl::schedule visitExtension(const isl::schedule_node &Extension, Args... args) { - isl::union_map TheExtension = Extension.extension_get_extension(); + isl::union_map TheExtension = + Extension.as().get_extension(); isl::schedule_node NewChild = getDerived() .visit(Extension.child(0), args...) .get_root() @@ -145,7 +147,8 @@ struct ScheduleTreeRewriter } isl::schedule visitFilter(const isl::schedule_node &Filter, Args... args) { - isl::union_set FilterDomain = Filter.filter_get_filter(); + isl::union_set FilterDomain = + Filter.as().get_filter(); isl::schedule NewSchedule = getDerived().visit(Filter.child(0), std::forward(args)...); return NewSchedule.intersect_domain(FilterDomain); @@ -236,7 +239,7 @@ struct ExtensionNodeRewriter isl::union_map NewPartialSchedMap = isl::union_map::from(PartialSched); unsigned BandDims = isl_schedule_node_band_n_member(OldNode.get()); for (isl::map Ext : NewChildExtensions.get_map_list()) { - unsigned ExtDims = Ext.domain_tuple_dim(); + unsigned ExtDims = Ext.domain_tuple_dim().release(); assert(ExtDims >= BandDims); unsigned OuterDims = ExtDims - BandDims; @@ -256,7 +259,7 @@ struct ExtensionNodeRewriter isl::schedule_node NewNode = NewChild.insert_partial_schedule(NewPartialSchedAsAsMultiUnionPwAff) .get_root() - .get_child(0); + .child(0); // Reapply permutability and coincidence attributes. 
NewNode = isl::manage(isl_schedule_node_band_set_permutable( @@ -274,7 +277,8 @@ struct ExtensionNodeRewriter isl::schedule visitFilter(const isl::schedule_node &Filter, const isl::union_set &Domain, isl::union_map &Extensions) { - isl::union_set FilterDomain = Filter.filter_get_filter(); + isl::union_set FilterDomain = + Filter.as().get_filter(); isl::union_set NewDomain = Domain.intersect(FilterDomain); // A filter is added implicitly if necessary when joining schedule trees. @@ -284,7 +288,8 @@ struct ExtensionNodeRewriter isl::schedule visitExtension(const isl::schedule_node &Extension, const isl::union_set &Domain, isl::union_map &Extensions) { - isl::union_map ExtDomain = Extension.extension_get_extension(); + isl::union_map ExtDomain = + Extension.as().get_extension(); isl::union_set NewDomain = Domain.unite(ExtDomain.range()); isl::union_map ChildExtensions; isl::schedule NewChild = @@ -340,7 +345,8 @@ struct ApplyASTBuildOptions isl::schedule_node visitBand(const isl::schedule_node &Band) { isl::schedule_node Result = - Band.band_set_ast_build_options(ASTBuildOptions[Pos]); + Band.as().set_ast_build_options( + ASTBuildOptions[Pos]); Pos += 1; return getBase().visitBand(Result); } @@ -412,7 +418,7 @@ static isl::id createGeneratedLoopAttr(isl::ctx Ctx, MDNode *FollowupLoopMD) { /// start with either the mark or the band. 
static isl::schedule_node moveToBandMark(isl::schedule_node BandOrMark) { if (isBandMark(BandOrMark)) { - assert(isBandWithSingleLoop(BandOrMark.get_child(0))); + assert(isBandWithSingleLoop(BandOrMark.child(0))); return BandOrMark; } assert(isBandWithSingleLoop(BandOrMark)); @@ -431,7 +437,7 @@ static isl::schedule_node removeMark(isl::schedule_node MarkOrBand, isl::schedule_node Band; if (isMark(MarkOrBand)) { - Attr = getLoopAttr(MarkOrBand.mark_get_id()); + Attr = getLoopAttr(MarkOrBand.as().get_id()); Band = isl::manage(isl_schedule_node_delete(MarkOrBand.release())); } else { Attr = nullptr; @@ -453,7 +459,7 @@ static isl::schedule_node insertMark(isl::schedule_node Band, isl::id Mark) { assert(moveToBandMark(Band).is_equal(Band) && "Don't add a two marks for a band"); - return Band.insert_mark(Mark).get_child(0); + return Band.insert_mark(Mark).child(0); } /// Return the (one-dimensional) set of numbers that are divisible by @p Factor @@ -484,7 +490,7 @@ static isl::basic_set isDivisibleBySet(isl::ctx &Ctx, long Factor, /// @param Set A set, which should be modified. /// @param VectorWidth A parameter, which determines the constraint. 
static isl::set addExtentConstraints(isl::set Set, int VectorWidth) { - unsigned Dims = Set.tuple_dim(); + unsigned Dims = Set.tuple_dim().release(); isl::space Space = Set.get_space(); isl::local_space LocalSpace = isl::local_space(Space); isl::constraint ExtConstr = isl::constraint::alloc_inequality(LocalSpace); @@ -499,7 +505,8 @@ static isl::set addExtentConstraints(isl::set Set, int VectorWidth) { } // namespace bool polly::isBandMark(const isl::schedule_node &Node) { - return isMark(Node) && isLoopAttr(Node.mark_get_id()); + return isMark(Node) && + isLoopAttr(Node.as().get_id()); } BandAttr *polly::getBandAttr(isl::schedule_node MarkOrBand) { @@ -507,7 +514,7 @@ BandAttr *polly::getBandAttr(isl::schedule_node MarkOrBand) { if (!isMark(MarkOrBand)) return nullptr; - return getLoopAttr(MarkOrBand.mark_get_id()); + return getLoopAttr(MarkOrBand.as().get_id()); } isl::schedule polly::hoistExtensionNodes(isl::schedule Sched) { @@ -543,13 +550,14 @@ isl::schedule polly::applyFullUnroll(isl::schedule_node BandToUnroll) { isl::multi_union_pw_aff PartialSched = isl::manage( isl_schedule_node_band_get_partial_schedule(BandToUnroll.get())); - assert(PartialSched.dim(isl::dim::out) == 1 && + assert(PartialSched.dim(isl::dim::out).release() == 1 && "Can only unroll a single dimension"); - isl::union_pw_aff PartialSchedUAff = PartialSched.get_union_pw_aff(0); + isl::union_pw_aff PartialSchedUAff = PartialSched.at(0); isl::union_set Domain = BandToUnroll.get_domain(); PartialSchedUAff = PartialSchedUAff.intersect_domain(Domain); - isl::union_map PartialSchedUMap = isl::union_map(PartialSchedUAff); + isl::union_map PartialSchedUMap = + isl::union_map::from(isl::union_pw_multi_aff(PartialSchedUAff)); // Enumerator only the scatter elements. isl::union_set ScatterList = PartialSchedUMap.range(); @@ -570,7 +578,7 @@ isl::schedule polly::applyFullUnroll(isl::schedule_node BandToUnroll) { }); // Convert the points to a sequence of filters. 
- isl::union_set_list List = isl::union_set_list::alloc(Ctx, Elts.size()); + isl::union_set_list List = isl::union_set_list(Ctx, Elts.size()); for (isl::point P : Elts) { // Determine the domains that map this scatter element. isl::union_set DomainFilter = PartialSchedUMap.intersect_range(P).domain(); @@ -599,7 +607,7 @@ isl::schedule polly::applyPartialUnroll(isl::schedule_node BandToUnroll, isl_schedule_node_band_get_partial_schedule(BandToUnroll.get())); // { Stmt[] -> [x] } - isl::union_pw_aff PartialSchedUAff = PartialSched.get_union_pw_aff(0); + isl::union_pw_aff PartialSchedUAff = PartialSched.at(0); // Here we assume the schedule stride is one and starts with 0, which is not // necessarily the case. @@ -616,10 +624,11 @@ isl::schedule polly::applyPartialUnroll(isl::schedule_node BandToUnroll, return isl::stat::ok(); }); - isl::union_set_list List = isl::union_set_list::alloc(Ctx, Factor); + isl::union_set_list List = isl::union_set_list(Ctx, Factor); for (auto i : seq(0, Factor)) { // { Stmt[] -> [x] } - isl::union_map UMap{PartialSchedUAff}; + isl::union_map UMap = + isl::union_map::from(isl::union_pw_multi_aff(PartialSchedUAff)); // { [x] } isl::basic_set Divisible = isDivisibleBySet(Ctx, Factor, i); @@ -650,7 +659,7 @@ isl::schedule polly::applyPartialUnroll(isl::schedule_node BandToUnroll, isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth) { - isl_size Dims = ScheduleRange.tuple_dim(); + isl_size Dims = ScheduleRange.tuple_dim().release(); isl::set LoopPrefixes = ScheduleRange.drop_constraints_involving_dims(isl::dim::set, Dims - 1, 1); auto ExtentPrefixes = addExtentConstraints(LoopPrefixes, VectorWidth); @@ -662,7 +671,7 @@ isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange, isl::union_set polly::getIsolateOptions(isl::set IsolateDomain, isl_size OutDimsNum) { - isl_size Dims = IsolateDomain.tuple_dim(); + isl_size Dims = IsolateDomain.tuple_dim().release(); assert(OutDimsNum <= Dims && "The isl::set 
IsolateDomain is used to describe the range of schedule " "dimensions values, which should be isolated. Consequently, the " @@ -693,7 +702,7 @@ isl::schedule_node polly::tileNode(isl::schedule_node Node, auto Dims = Space.dim(isl::dim::set); auto Sizes = isl::multi_val::zero(Space); std::string IdentifierString(Identifier); - for (auto i : seq(0, Dims)) { + for (auto i : seq(0, Dims.release())) { auto tileSize = i < (isl_size)TileSizes.size() ? TileSizes[i] : DefaultTileSize; Sizes = Sizes.set_val(i, isl::val(Node.ctx(), tileSize)); @@ -717,5 +726,6 @@ isl::schedule_node polly::applyRegisterTiling(isl::schedule_node Node, int DefaultTileSize) { Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize); auto Ctx = Node.ctx(); - return Node.band_set_ast_build_options(isl::union_set(Ctx, "{unroll[x]}")); + return Node.as().set_ast_build_options( + isl::union_set(Ctx, "{unroll[x]}")); } diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp index e5fd53289f1c5..d839289bdcb67 100644 --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -101,7 +101,7 @@ static isl::union_map underapproximatedAddMap(isl::union_map UMap, isl::map Result = isl::map::empty(PrevMap.get_space()); for (isl::basic_map BMap : PrevMap.get_basic_map_list()) { - if (Result.n_basic_map() > SimplifyMaxDisjuncts) + if (Result.n_basic_map().release() > SimplifyMaxDisjuncts) break; Result = Result.unite(BMap); } diff --git a/polly/lib/Transform/ZoneAlgo.cpp b/polly/lib/Transform/ZoneAlgo.cpp index ae5354c7fb9f2..1aec6708ef220 100644 --- a/polly/lib/Transform/ZoneAlgo.cpp +++ b/polly/lib/Transform/ZoneAlgo.cpp @@ -246,7 +246,8 @@ static isl::map makeUnknownForDomain(isl::set Domain) { static bool isMapToUnknown(const isl::map &Map) { isl::space Space = Map.get_space().range(); return Space.has_tuple_id(isl::dim::set).is_false() && - Space.is_wrapping().is_false() && Space.dim(isl::dim::set) == 0; + Space.is_wrapping().is_false() && + 
Space.dim(isl::dim::set).release() == 0; } isl::union_map polly::filterKnownValInst(const isl::union_map &UMap) { @@ -685,10 +686,12 @@ isl::map ZoneAlgorithm::getDefToTarget(ScopStmt *DefStmt, TargetStmt->getSurroundingLoop())) { isl::set DefDomain = getDomainFor(DefStmt); isl::set TargetDomain = getDomainFor(TargetStmt); - assert(DefDomain.tuple_dim() <= TargetDomain.tuple_dim()); + assert(DefDomain.tuple_dim().release() <= + TargetDomain.tuple_dim().release()); Result = isl::map::from_domain_and_range(DefDomain, TargetDomain); - for (unsigned i = 0, DefDims = DefDomain.tuple_dim(); i < DefDims; i += 1) + for (unsigned i = 0, DefDims = DefDomain.tuple_dim().release(); i < DefDims; + i += 1) Result = Result.equate(isl::dim::in, i, isl::dim::out, i); } diff --git a/polly/unittests/Support/ISLTools.cpp b/polly/unittests/Support/ISLTools.cpp index 2a796439cfe4d..35225eb4ff373 100644 --- a/polly/unittests/Support/ISLTools.cpp +++ b/polly/unittests/Support/ISLTools.cpp @@ -19,7 +19,7 @@ TEST(Support, isl_iterator) { Ctx, "{ [x, y] : 0 <= x <= 5 and y >= 0 and x <= 4 and y <= 3 + x }"); isl::set S = A.unite(B); - ASSERT_EQ(S.n_basic_set(), 2); + ASSERT_EQ(S.n_basic_set().release(), 2); std::vector Sets; for (auto BS : S.get_basic_set_list()) Sets.push_back(BS); From 54a61c94f932f894f2695e0c18bb288d2d1407b7 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 16 Aug 2021 14:46:52 +0100 Subject: [PATCH 110/700] [DebugInfo][InstrRef] Honour too-much-debug-info cutouts VarLoc based LiveDebugValues will abandon variable location propagation if there are too many blocks and variable assignments in the function. If it didn't, and we had (say) 1000 blocks and 1000 variables in scope, we'd end up with 1 million DBG_VALUEs just at the start of blocks. Instruction-referencing LiveDebugValues should honour this limitation too (because the same limitation applies to it). 
Hoist the relevant command line options into LiveDebugValues.cpp and pass it down into the implementation classes as an argument to ExtendRanges. I've duplicated all the run-lines in live-debug-values-cutoffs.mir to have an instruction-referencing flavour. Differential Revision: https://reviews.llvm.org/D107823 --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 21 ++++++++++++++++--- .../LiveDebugValues/LiveDebugValues.cpp | 14 ++++++++++++- .../CodeGen/LiveDebugValues/LiveDebugValues.h | 4 +++- .../LiveDebugValues/VarLocBasedImpl.cpp | 21 ++++++------------- .../MIR/X86/live-debug-values-cutoffs.mir | 20 ++++++++++++++++++ 5 files changed, 60 insertions(+), 20 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index dc99070583406..a653c0a994e93 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1684,7 +1684,8 @@ class InstrRefBasedLDV : public LDVImpl { /// RPOT block ordering. void initialSetup(MachineFunction &MF); - bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override; + bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, unsigned InputDbgValLimit) override; public: /// Default construct and initialize the pass. @@ -3523,8 +3524,9 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. -bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, - TargetPassConfig *TPC) { +bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, + unsigned InputDbgValLimit) { // No subprogram means this function contains no debuginfo. 
if (!MF.getFunction().getSubprogram()) return false; @@ -3626,6 +3628,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // To mirror old LiveDebugValues, enumerate variables in RPOT order. Otherwise // the order is unimportant, it just has to be stable. + unsigned VarAssignCount = 0; for (unsigned int I = 0; I < OrderToBB.size(); ++I) { auto *MBB = OrderToBB[I]; auto *VTracker = &vlocs[MBB->getNumber()]; @@ -3643,9 +3646,21 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, ScopeToVars[Scope].insert(Var); ScopeToBlocks[Scope].insert(VTracker->MBB); ScopeToDILocation[Scope] = ScopeLoc; + ++VarAssignCount; } } + // If we have an extremely large number of variable assignments and blocks, + // bail out at this point. We've burnt some time doing analysis already, + // however we should cut our losses. + if (MaxNumBlocks > InputBBLimit && VarAssignCount > InputDbgValLimit) { + LLVM_DEBUG(dbgs() << "Disabling InstrRefBasedLDV: " << MF.getName() + << " has " << MaxNumBlocks << " basic blocks and " + << VarAssignCount + << " variable assignments, exceeding limits.\n"); + return false; + } + // OK. Iterate over scopes: there might be something to be said for // ordering them by size/locality, but that's for the future. For each scope, // solve the variable value problem, producing a map of variables to values diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index 38e803d1abb55..bc1eaff60440f 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -40,6 +40,18 @@ static cl::opt "normal DBG_VALUE inputs"), cl::init(false)); +// Options to prevent pathological compile-time behavior. If InputBBLimit and +// InputDbgValueLimit are both exceeded, range extension is disabled. 
+static cl::opt InputBBLimit( + "livedebugvalues-input-bb-limit", + cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"), + cl::init(10000), cl::Hidden); +static cl::opt InputDbgValueLimit( + "livedebugvalues-input-dbg-value-limit", + cl::desc( + "Maximum input DBG_VALUE insts supported by debug range extension"), + cl::init(50000), cl::Hidden); + /// Generic LiveDebugValues pass. Calls through to VarLocBasedLDV or /// InstrRefBasedLDV to perform location propagation, via the LDVImpl /// base class. @@ -103,5 +115,5 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TheImpl = llvm::makeVarLocBasedLiveDebugValues(); } - return TheImpl->ExtendRanges(MF, TPC); + return TheImpl->ExtendRanges(MF, TPC, InputBBLimit, InputDbgValueLimit); } diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h index 9c910f180b9fb..e38360b08bafa 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h @@ -23,7 +23,9 @@ inline namespace SharedLiveDebugValues { // implementation. class LDVImpl { public: - virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) = 0; + virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, + unsigned InputDbgValLimit) = 0; virtual ~LDVImpl() {} }; diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 1e6d65c189535..977d3ede5c776 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -166,18 +166,6 @@ using namespace llvm; STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); -// Options to prevent pathological compile-time behavior. If InputBBLimit and -// InputDbgValueLimit are both exceeded, range extension is disabled. 
-static cl::opt InputBBLimit( - "livedebugvalues-input-bb-limit", - cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"), - cl::init(10000), cl::Hidden); -static cl::opt InputDbgValueLimit( - "livedebugvalues-input-dbg-value-limit", - cl::desc( - "Maximum input DBG_VALUE insts supported by debug range extension"), - cl::init(50000), cl::Hidden); - /// If \p Op is a stack or frame register return true, otherwise return false. /// This is used to avoid basing the debug entry values on the registers, since /// we do not support it at the moment. @@ -1007,7 +995,8 @@ class VarLocBasedLDV : public LDVImpl { /// had their instruction creation deferred. void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs); - bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override; + bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, unsigned InputDbgValLimit) override; public: /// Default construct and initialize the pass. @@ -2048,7 +2037,9 @@ void VarLocBasedLDV::recordEntryValue(const MachineInstr &MI, /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. 
-bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) { +bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, + unsigned InputDbgValLimit) { LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n"); if (!MF.getFunction().getSubprogram()) @@ -2141,7 +2132,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) { for (auto &MI : MBB) if (MI.isDebugValue()) ++NumInputDbgValues; - if (NumInputDbgValues > InputDbgValueLimit) { + if (NumInputDbgValues > InputDbgValLimit) { LLVM_DEBUG(dbgs() << "Disabling VarLocBasedLDV: " << MF.getName() << " has " << RPONumber << " basic blocks and " << NumInputDbgValues diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir index 4922c36086f16..17b6b9b3149c3 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir @@ -5,21 +5,41 @@ # RUN: -livedebugvalues-input-bb-limit=1 \ # RUN: -livedebugvalues-input-dbg-value-limit=1 \ # RUN: | FileCheck %s -check-prefix=LDV-DISABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=1 \ +# RUN: -livedebugvalues-input-dbg-value-limit=1 \ +# RUN: | FileCheck %s -check-prefix=LDV-DISABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=1 \ # RUN: -livedebugvalues-input-dbg-value-limit=10 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=1 \ +# RUN: -livedebugvalues-input-dbg-value-limit=10 \ +# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # RUN: llc %s -o - -run-pass=livedebugvalues 
-mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=10 \ # RUN: -livedebugvalues-input-dbg-value-limit=1 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=10 \ +# RUN: -livedebugvalues-input-dbg-value-limit=1 \ +# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=10 \ # RUN: -livedebugvalues-input-dbg-value-limit=10 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=10 \ +# RUN: -livedebugvalues-input-dbg-value-limit=10 \ +# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # LDV-DISABLED-LABEL: bb.1.exit # LDV-DISABLED-NEXT: $edi = MOV32rm From 2d3668c997faac1f64cd3b8eb336af989069d135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krist=C3=B3f=20Umann?= Date: Mon, 5 Jul 2021 15:11:46 +0200 Subject: [PATCH 111/700] [analyzer] MallocChecker: Add a visitor to leave a note on functions that could have, but did not change ownership on leaked memory This is a rather common feedback we get from out leak checkers: bug reports are really short, and are contain barely any usable information on what the analyzer did to conclude that a leak actually happened. This happens because of our bug report minimizing effort. We construct bug reports by inspecting the ExplodedNodes that lead to the error from the bottom up (from the error node all the way to the root of the exploded graph), and mark entities that were the cause of a bug, or have interacted with it as interesting. 
In order to make the bug report a bit less verbose, whenever we find an entire function call (from CallEnter to CallExitEnd) that didn't talk about any interesting entity, we prune it (click here for more info on bug report generation). Even if the event to highlight is exactly this lack of interaction with interesting entities. D105553 generalized the visitor that creates notes for these cases. This patch adds a new kind of NoStateChangeVisitor that leaves notes in functions that took a piece of dynamically allocated memory that later leaked as parameter, and didn't change its ownership status. Differential Revision: https://reviews.llvm.org/D105553 --- .../clang/StaticAnalyzer/Checkers/Checkers.td | 12 +- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 147 +++++++++++++++++- clang/test/Analysis/NewDeleteLeaks.cpp | 142 +++++++++++++++++ clang/test/Analysis/analyzer-config.c | 1 + 4 files changed, 300 insertions(+), 2 deletions(-) create mode 100644 clang/test/Analysis/NewDeleteLeaks.cpp diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 444b00d73f0b7..125ef859d1ebb 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -485,7 +485,17 @@ def DynamicMemoryModeling: Checker<"DynamicMemoryModeling">, "allocating and deallocating functions are annotated with " "ownership_holds, ownership_takes and ownership_returns.", "false", - InAlpha> + InAlpha>, + CmdLineOption ]>, Dependencies<[CStringModeling]>, Documentation, diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index a6470da09c458..7db4066653cbd 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -48,6 +48,7 @@ #include "InterCheckerAPI.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclCXX.h" +#include 
"clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/ParentMap.h" @@ -64,12 +65,15 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h" #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/StoreRef.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Compiler.h" @@ -298,6 +302,8 @@ class MallocChecker /// which might free a pointer are annotated. DefaultBool ShouldIncludeOwnershipAnnotatedFunctions; + DefaultBool ShouldRegisterNoOwnershipChangeVisitor; + /// Many checkers are essentially built into this one, so enabling them will /// make MallocChecker perform additional modeling and reporting. enum CheckKind { @@ -722,11 +728,146 @@ class MallocChecker bool isArgZERO_SIZE_PTR(ProgramStateRef State, CheckerContext &C, SVal ArgVal) const; }; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Definition of NoOwnershipChangeVisitor. +//===----------------------------------------------------------------------===// + +namespace { +class NoOwnershipChangeVisitor final : public NoStateChangeFuncVisitor { + SymbolRef Sym; + using OwnerSet = llvm::SmallPtrSet; + + // Collect which entities point to the allocated memory, and could be + // responsible for deallocating it. 
+ class OwnershipBindingsHandler : public StoreManager::BindingsHandler { + SymbolRef Sym; + OwnerSet &Owners; + + public: + OwnershipBindingsHandler(SymbolRef Sym, OwnerSet &Owners) + : Sym(Sym), Owners(Owners) {} + + bool HandleBinding(StoreManager &SMgr, Store Store, const MemRegion *Region, + SVal Val) override { + if (Val.getAsSymbol() == Sym) + Owners.insert(Region); + return true; + } + }; + +protected: + OwnerSet getOwnersAtNode(const ExplodedNode *N) { + OwnerSet Ret; + + ProgramStateRef State = N->getState(); + OwnershipBindingsHandler Handler{Sym, Ret}; + State->getStateManager().getStoreManager().iterBindings(State->getStore(), + Handler); + return Ret; + } + + static const ExplodedNode *getCallExitEnd(const ExplodedNode *N) { + while (N && !N->getLocationAs()) + N = N->getFirstSucc(); + return N; + } + + virtual bool + wasModifiedBeforeCallExit(const ExplodedNode *CurrN, + const ExplodedNode *CallExitN) override { + if (CurrN->getLocationAs()) + return true; + + // Its the state right *after* the call that is interesting. Any pointers + // inside the call that pointed to the allocated memory are of little + // consequence if their lifetime ends within the function. + CallExitN = getCallExitEnd(CallExitN); + if (!CallExitN) + return true; + + if (CurrN->getState()->get(Sym) != + CallExitN->getState()->get(Sym)) + return true; + + OwnerSet CurrOwners = getOwnersAtNode(CurrN); + OwnerSet ExitOwners = getOwnersAtNode(CallExitN); + + // Owners in the current set may be purged from the analyzer later on. + // If a variable is dead (is not referenced directly or indirectly after + // some point), it will be removed from the Store before the end of its + // actual lifetime. + // This means that that if the ownership status didn't change, CurrOwners + // must be a superset of, but not necessarily equal to ExitOwners. 
+ return !llvm::set_is_subset(ExitOwners, CurrOwners); + } + + static PathDiagnosticPieceRef emitNote(const ExplodedNode *N) { + PathDiagnosticLocation L = PathDiagnosticLocation::create( + N->getLocation(), + N->getState()->getStateManager().getContext().getSourceManager()); + return std::make_shared( + L, "Returning without deallocating memory or storing the pointer for " + "later deallocation"); + } + + virtual PathDiagnosticPieceRef + maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) override { + // TODO: Implement. + return nullptr; + } + + virtual PathDiagnosticPieceRef + maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) override { + // TODO: Implement. + return nullptr; + } + + virtual PathDiagnosticPieceRef + maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N) override { + // TODO: Factor the logic of "what constitutes as an entity being passed + // into a function call" out by reusing the code in + // NoStoreFuncVisitor::maybeEmitNoteForParameters, maybe by incorporating + // the printing technology in UninitializedObject's FieldChainInfo. + ArrayRef Parameters = Call.parameters(); + for (unsigned I = 0; I < Call.getNumArgs() && I < Parameters.size(); ++I) { + SVal V = Call.getArgSVal(I); + if (V.getAsSymbol() == Sym) + return emitNote(N); + } + return nullptr; + } + +public: + NoOwnershipChangeVisitor(SymbolRef Sym) + : NoStateChangeFuncVisitor(bugreporter::TrackingKind::Thorough), + Sym(Sym) {} + + void Profile(llvm::FoldingSetNodeID &ID) const override { + static int Tag = 0; + ID.AddPointer(&Tag); + ID.AddPointer(Sym); + } + + void *getTag() const { + static int Tag = 0; + return static_cast(&Tag); + } +}; + +} // end anonymous namespace //===----------------------------------------------------------------------===// // Definition of MallocBugVisitor. 
//===----------------------------------------------------------------------===// +namespace { /// The bug visitor which allows us to print extra diagnostics along the /// BugReport path. For example, showing the allocation site of the leaked /// region. @@ -851,7 +992,6 @@ class MallocBugVisitor final : public BugReporterVisitor { } }; }; - } // end anonymous namespace // A map from the freed symbol to the symbol representing the return value of @@ -2579,6 +2719,8 @@ void MallocChecker::HandleLeak(SymbolRef Sym, ExplodedNode *N, AllocNode->getLocationContext()->getDecl()); R->markInteresting(Sym); R->addVisitor(Sym, true); + if (ShouldRegisterNoOwnershipChangeVisitor) + R->addVisitor(Sym); C.emitReport(std::move(R)); } @@ -3395,6 +3537,9 @@ void ento::registerDynamicMemoryModeling(CheckerManager &mgr) { auto *checker = mgr.registerChecker(); checker->ShouldIncludeOwnershipAnnotatedFunctions = mgr.getAnalyzerOptions().getCheckerBooleanOption(checker, "Optimistic"); + checker->ShouldRegisterNoOwnershipChangeVisitor = + mgr.getAnalyzerOptions().getCheckerBooleanOption( + checker, "AddNoOwnershipChangeNotes"); } bool ento::shouldRegisterDynamicMemoryModeling(const CheckerManager &mgr) { diff --git a/clang/test/Analysis/NewDeleteLeaks.cpp b/clang/test/Analysis/NewDeleteLeaks.cpp new file mode 100644 index 0000000000000..28040d9d0d36b --- /dev/null +++ b/clang/test/Analysis/NewDeleteLeaks.cpp @@ -0,0 +1,142 @@ +// RUN: %clang_analyze_cc1 -verify -analyzer-output=text %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=cplusplus \ +// RUN: -analyzer-checker=unix \ +// RUN: -analyzer-config \ +// RUN: unix.DynamicMemoryModeling:AddNoOwnershipChangeNotes=false + +// RUN: %clang_analyze_cc1 -verify=expected,ownership -analyzer-output=text %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=cplusplus \ +// RUN: -analyzer-checker=unix \ +// RUN: -analyzer-config \ +// RUN: unix.DynamicMemoryModeling:AddNoOwnershipChangeNotes=true + +#include 
"Inputs/system-header-simulator-for-malloc.h" + +//===----------------------------------------------------------------------===// +// Report for which we expect NoOwnershipChangeVisitor to add a new note. +//===----------------------------------------------------------------------===// + +bool coin(); + +namespace memory_allocated_in_fn_call { + +void sink(int *P) { +} // ownership-note {{Returning without deallocating memory or storing the pointer for later deallocation}} + +void foo() { + sink(new int(5)); // expected-note {{Memory is allocated}} + // ownership-note@-1 {{Calling 'sink'}} + // ownership-note@-2 {{Returning from 'sink'}} +} // expected-warning {{Potential memory leak [cplusplus.NewDeleteLeaks]}} +// expected-note@-1 {{Potential memory leak}} + +} // namespace memory_allocated_in_fn_call + +namespace memory_passed_to_fn_call { + +void sink(int *P) { + if (coin()) // ownership-note {{Assuming the condition is false}} + // ownership-note@-1 {{Taking false branch}} + delete P; +} // ownership-note {{Returning without deallocating memory or storing the pointer for later deallocation}} + +void foo() { + int *ptr = new int(5); // expected-note {{Memory is allocated}} + sink(ptr); // ownership-note {{Calling 'sink'}} + // ownership-note@-1 {{Returning from 'sink'}} +} // expected-warning {{Potential leak of memory pointed to by 'ptr' [cplusplus.NewDeleteLeaks]}} +// expected-note@-1 {{Potential leak}} + +} // namespace memory_passed_to_fn_call + +namespace memory_shared_with_ptr_of_shorter_lifetime { + +void sink(int *P) { + int *Q = P; + if (coin()) // ownership-note {{Assuming the condition is false}} + // ownership-note@-1 {{Taking false branch}} + delete P; + (void)Q; +} // ownership-note {{Returning without deallocating memory or storing the pointer for later deallocation}} + +void foo() { + int *ptr = new int(5); // expected-note {{Memory is allocated}} + sink(ptr); // ownership-note {{Calling 'sink'}} + // ownership-note@-1 {{Returning from 'sink'}} 
+} // expected-warning {{Potential leak of memory pointed to by 'ptr' [cplusplus.NewDeleteLeaks]}} +// expected-note@-1 {{Potential leak}} + +} // namespace memory_shared_with_ptr_of_shorter_lifetime + +//===----------------------------------------------------------------------===// +// Report for which we *do not* expect NoOwnershipChangeVisitor add a new note, +// nor do we want it to. +//===----------------------------------------------------------------------===// + +namespace memory_not_passed_to_fn_call { + +void sink(int *P) { + if (coin()) + delete P; +} + +void foo() { + int *ptr = new int(5); // expected-note {{Memory is allocated}} + int *q = nullptr; + sink(q); + (void)ptr; +} // expected-warning {{Potential leak of memory pointed to by 'ptr' [cplusplus.NewDeleteLeaks]}} +// expected-note@-1 {{Potential leak}} + +} // namespace memory_not_passed_to_fn_call + +namespace memory_shared_with_ptr_of_same_lifetime { + +void sink(int *P, int **Q) { + // NOTE: Not a job of NoOwnershipChangeVisitor, but maybe this could be + // highlighted still? + *Q = P; +} + +void foo() { + int *ptr = new int(5); // expected-note {{Memory is allocated}} + int *q = nullptr; + sink(ptr, &q); +} // expected-warning {{Potential leak of memory pointed to by 'q' [cplusplus.NewDeleteLeaks]}} +// expected-note@-1 {{Potential leak}} + +} // namespace memory_shared_with_ptr_of_same_lifetime + +// TODO: We don't want a note here. sink() doesn't seem like a function that +// even attempts to take care of any memory ownership problems. 
+namespace memory_passed_into_fn_that_doesnt_intend_to_free { + +void sink(int *P) { +} // ownership-note {{Returning without deallocating memory or storing the pointer for later deallocation}} + +void foo() { + int *ptr = new int(5); // expected-note {{Memory is allocated}} + sink(ptr); // ownership-note {{Calling 'sink'}} + // ownership-note@-1 {{Returning from 'sink'}} +} // expected-warning {{Potential leak of memory pointed to by 'ptr' [cplusplus.NewDeleteLeaks]}} +// expected-note@-1 {{Potential leak}} + +} // namespace memory_passed_into_fn_that_doesnt_intend_to_free + +namespace refkind_from_unoallocated_to_allocated { + +// RefKind of the symbol changed from nothing to Allocated. We don't want to +// emit notes when the RefKind changes in the stack frame. +static char *malloc_wrapper_ret() { + return (char *)malloc(12); // expected-note {{Memory is allocated}} +} +void use_ret() { + char *v; + v = malloc_wrapper_ret(); // expected-note {{Calling 'malloc_wrapper_ret'}} + // expected-note@-1 {{Returned allocated memory}} +} // expected-warning {{Potential leak of memory pointed to by 'v' [unix.Malloc]}} +// expected-note@-1 {{Potential leak of memory pointed to by 'v'}} + +} // namespace refkind_from_unoallocated_to_allocated diff --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c index d286f1258c21a..2a41e692dd59b 100644 --- a/clang/test/Analysis/analyzer-config.c +++ b/clang/test/Analysis/analyzer-config.c @@ -116,6 +116,7 @@ // CHECK-NEXT: suppress-null-return-paths = true // CHECK-NEXT: track-conditions = true // CHECK-NEXT: track-conditions-debug = false +// CHECK-NEXT: unix.DynamicMemoryModeling:AddNoOwnershipChangeNotes = false // CHECK-NEXT: unix.DynamicMemoryModeling:Optimistic = false // CHECK-NEXT: unroll-loops = false // CHECK-NEXT: widen-loops = false From 547b712500e9bbe2b4184ff3eedd5f5b6a29523b Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 16 Aug 2021 15:28:56 +0100 Subject: [PATCH 112/700] Suppress 
signedness-comparison warning This is a follow-up to 54a61c94f93. --- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a653c0a994e93..3018e98870f5a 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -3653,7 +3653,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, // If we have an extremely large number of variable assignments and blocks, // bail out at this point. We've burnt some time doing analysis already, // however we should cut our losses. - if (MaxNumBlocks > InputBBLimit && VarAssignCount > InputDbgValLimit) { + if (MaxNumBlocks > (int)InputBBLimit && VarAssignCount > InputDbgValLimit) { LLVM_DEBUG(dbgs() << "Disabling InstrRefBasedLDV: " << MF.getName() << " has " << MaxNumBlocks << " basic blocks and " << VarAssignCount From a19747ea7395dd470345e4703f13bbb74647b019 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Mon, 16 Aug 2021 11:53:30 +0100 Subject: [PATCH 113/700] Fix type in DenseMap to match V.size() Differential Revision: https://reviews.llvm.org/D108124 --- llvm/include/llvm/ADT/SmallBitVector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h index f570bac23ad51..c70bc88fb1f24 100644 --- a/llvm/include/llvm/ADT/SmallBitVector.h +++ b/llvm/include/llvm/ADT/SmallBitVector.h @@ -721,7 +721,7 @@ template <> struct DenseMapInfo { } static unsigned getHashValue(const SmallBitVector &V) { uintptr_t Store; - return DenseMapInfo>>::getHashValue( + return DenseMapInfo>>::getHashValue( std::make_pair(V.size(), V.getData(Store))); } static bool isEqual(const SmallBitVector &LHS, const SmallBitVector &RHS) { From de285eacb0113f7be32163f0166ef2c0faf18d65 Mon Sep 
17 00:00:00 2001 From: Sanjay Patel Date: Mon, 16 Aug 2021 10:28:46 -0400 Subject: [PATCH 114/700] [InstCombine] allow for constant-folding in GEP transform This would crash the reduced test or as described in https://llvm.org/PR51485 ...because we can't mark a constant (-expression) with 'inbounds'. --- .../InstCombine/InstructionCombining.cpp | 10 ++++---- .../InstCombine/gep-combine-loop-invariant.ll | 23 +++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 434659cf4b2c7..1a803749cdbb8 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2118,10 +2118,12 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { // -- have to recreate %src & %gep // put NewSrc at same location as %src Builder.SetInsertPoint(cast(PtrOp)); - auto *NewSrc = cast( - Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName())); - NewSrc->setIsInBounds(Src->isInBounds()); - auto *NewGEP = + Value *NewSrc = + Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()); + // Propagate 'inbounds' if the new source was not constant-folded. + if (auto *NewSrcGEPI = dyn_cast(NewSrc)) + NewSrcGEPI->setIsInBounds(Src->isInBounds()); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1}); NewGEP->setIsInBounds(GEP.isInBounds()); return NewGEP; diff --git a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll index faaaff8fec91a..58d16e1485d09 100644 --- a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll +++ b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll @@ -186,3 +186,26 @@ loop: call void @blackhole(<2 x i8*> %e6) br label %loop } + +; This would crash because we did not expect to be able to constant fold a GEP. 
+ +define void @PR51485(<2 x i64> %v) { +; CHECK-LABEL: @PR51485( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SL1:%.*]] = shl nuw nsw <2 x i64> [[V:%.*]], +; CHECK-NEXT: [[E6:%.*]] = getelementptr inbounds i8, i8* getelementptr (i8, i8* bitcast (void (<2 x i64>)* @PR51485 to i8*), i64 80), <2 x i64> [[SL1]] +; CHECK-NEXT: call void @blackhole(<2 x i8*> [[E6]]) +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %sl1 = shl nuw nsw <2 x i64> %v, + %e5 = getelementptr inbounds i8, i8* bitcast (void (<2 x i64>)* @PR51485 to i8*), <2 x i64> %sl1 + %e6 = getelementptr inbounds i8, <2 x i8*> %e5, i64 80 + call void @blackhole(<2 x i8*> %e6) + br label %loop +} From 95fe61e63954efc63b1152a528fefddf0b6e2848 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 16 Aug 2021 15:43:38 +0100 Subject: [PATCH 115/700] Revert 54a61c94f93 and its follow up in 547b712500e These were part of D107823, however asan has found something excitingly wrong happening: https://lab.llvm.org/buildbot/#/builders/5/builds/10543/steps/13/logs/stdio --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 21 +++---------------- .../LiveDebugValues/LiveDebugValues.cpp | 14 +------------ .../CodeGen/LiveDebugValues/LiveDebugValues.h | 4 +--- .../LiveDebugValues/VarLocBasedImpl.cpp | 21 +++++++++++++------ .../MIR/X86/live-debug-values-cutoffs.mir | 20 ------------------ 5 files changed, 20 insertions(+), 60 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 3018e98870f5a..dc99070583406 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1684,8 +1684,7 @@ class InstrRefBasedLDV : public LDVImpl { /// RPOT block ordering. 
void initialSetup(MachineFunction &MF); - bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, - unsigned InputBBLimit, unsigned InputDbgValLimit) override; + bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override; public: /// Default construct and initialize the pass. @@ -3524,9 +3523,8 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. -bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, - unsigned InputBBLimit, - unsigned InputDbgValLimit) { +bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, + TargetPassConfig *TPC) { // No subprogram means this function contains no debuginfo. if (!MF.getFunction().getSubprogram()) return false; @@ -3628,7 +3626,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, // To mirror old LiveDebugValues, enumerate variables in RPOT order. Otherwise // the order is unimportant, it just has to be stable. - unsigned VarAssignCount = 0; for (unsigned int I = 0; I < OrderToBB.size(); ++I) { auto *MBB = OrderToBB[I]; auto *VTracker = &vlocs[MBB->getNumber()]; @@ -3646,21 +3643,9 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, ScopeToVars[Scope].insert(Var); ScopeToBlocks[Scope].insert(VTracker->MBB); ScopeToDILocation[Scope] = ScopeLoc; - ++VarAssignCount; } } - // If we have an extremely large number of variable assignments and blocks, - // bail out at this point. We've burnt some time doing analysis already, - // however we should cut our losses. - if (MaxNumBlocks > (int)InputBBLimit && VarAssignCount > InputDbgValLimit) { - LLVM_DEBUG(dbgs() << "Disabling InstrRefBasedLDV: " << MF.getName() - << " has " << MaxNumBlocks << " basic blocks and " - << VarAssignCount - << " variable assignments, exceeding limits.\n"); - return false; - } - // OK. 
Iterate over scopes: there might be something to be said for // ordering them by size/locality, but that's for the future. For each scope, // solve the variable value problem, producing a map of variables to values diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index bc1eaff60440f..38e803d1abb55 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -40,18 +40,6 @@ static cl::opt "normal DBG_VALUE inputs"), cl::init(false)); -// Options to prevent pathological compile-time behavior. If InputBBLimit and -// InputDbgValueLimit are both exceeded, range extension is disabled. -static cl::opt InputBBLimit( - "livedebugvalues-input-bb-limit", - cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"), - cl::init(10000), cl::Hidden); -static cl::opt InputDbgValueLimit( - "livedebugvalues-input-dbg-value-limit", - cl::desc( - "Maximum input DBG_VALUE insts supported by debug range extension"), - cl::init(50000), cl::Hidden); - /// Generic LiveDebugValues pass. Calls through to VarLocBasedLDV or /// InstrRefBasedLDV to perform location propagation, via the LDVImpl /// base class. @@ -115,5 +103,5 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TheImpl = llvm::makeVarLocBasedLiveDebugValues(); } - return TheImpl->ExtendRanges(MF, TPC, InputBBLimit, InputDbgValueLimit); + return TheImpl->ExtendRanges(MF, TPC); } diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h index e38360b08bafa..9c910f180b9fb 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h @@ -23,9 +23,7 @@ inline namespace SharedLiveDebugValues { // implementation. 
class LDVImpl { public: - virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, - unsigned InputBBLimit, - unsigned InputDbgValLimit) = 0; + virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) = 0; virtual ~LDVImpl() {} }; diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 977d3ede5c776..1e6d65c189535 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -166,6 +166,18 @@ using namespace llvm; STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); +// Options to prevent pathological compile-time behavior. If InputBBLimit and +// InputDbgValueLimit are both exceeded, range extension is disabled. +static cl::opt InputBBLimit( + "livedebugvalues-input-bb-limit", + cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"), + cl::init(10000), cl::Hidden); +static cl::opt InputDbgValueLimit( + "livedebugvalues-input-dbg-value-limit", + cl::desc( + "Maximum input DBG_VALUE insts supported by debug range extension"), + cl::init(50000), cl::Hidden); + /// If \p Op is a stack or frame register return true, otherwise return false. /// This is used to avoid basing the debug entry values on the registers, since /// we do not support it at the moment. @@ -995,8 +1007,7 @@ class VarLocBasedLDV : public LDVImpl { /// had their instruction creation deferred. void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs); - bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, - unsigned InputBBLimit, unsigned InputDbgValLimit) override; + bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override; public: /// Default construct and initialize the pass. @@ -2037,9 +2048,7 @@ void VarLocBasedLDV::recordEntryValue(const MachineInstr &MI, /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. 
-bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, - unsigned InputBBLimit, - unsigned InputDbgValLimit) { +bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) { LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n"); if (!MF.getFunction().getSubprogram()) @@ -2132,7 +2141,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, for (auto &MI : MBB) if (MI.isDebugValue()) ++NumInputDbgValues; - if (NumInputDbgValues > InputDbgValLimit) { + if (NumInputDbgValues > InputDbgValueLimit) { LLVM_DEBUG(dbgs() << "Disabling VarLocBasedLDV: " << MF.getName() << " has " << RPONumber << " basic blocks and " << NumInputDbgValues diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir index 17b6b9b3149c3..4922c36086f16 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir @@ -5,41 +5,21 @@ # RUN: -livedebugvalues-input-bb-limit=1 \ # RUN: -livedebugvalues-input-dbg-value-limit=1 \ # RUN: | FileCheck %s -check-prefix=LDV-DISABLED -# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ -# RUN: -experimental-debug-variable-locations \ -# RUN: -livedebugvalues-input-bb-limit=1 \ -# RUN: -livedebugvalues-input-dbg-value-limit=1 \ -# RUN: | FileCheck %s -check-prefix=LDV-DISABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=1 \ # RUN: -livedebugvalues-input-dbg-value-limit=10 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED -# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ -# RUN: -experimental-debug-variable-locations \ -# RUN: -livedebugvalues-input-bb-limit=1 \ -# RUN: -livedebugvalues-input-dbg-value-limit=10 \ -# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # RUN: llc %s -o - -run-pass=livedebugvalues 
-mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=10 \ # RUN: -livedebugvalues-input-dbg-value-limit=1 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED -# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ -# RUN: -experimental-debug-variable-locations \ -# RUN: -livedebugvalues-input-bb-limit=10 \ -# RUN: -livedebugvalues-input-dbg-value-limit=1 \ -# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=10 \ # RUN: -livedebugvalues-input-dbg-value-limit=10 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED -# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ -# RUN: -experimental-debug-variable-locations \ -# RUN: -livedebugvalues-input-bb-limit=10 \ -# RUN: -livedebugvalues-input-dbg-value-limit=10 \ -# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # LDV-DISABLED-LABEL: bb.1.exit # LDV-DISABLED-NEXT: $edi = MOV32rm From 6eeb4c1f3203566599c12a5f2b147c6c5a92ee1d Mon Sep 17 00:00:00 2001 From: "Peyton, Jonathan L" Date: Tue, 13 Jul 2021 14:42:27 -0500 Subject: [PATCH 116/700] [OpenMP] Fix incorrect parameters to sscanf_s call On Windows, the documentation states that when using sscanf_s, each %c and %s specifier must also have additional size parameter. This patch adds the size parameter in the one place where %c is used. 
Differential Revision: https://reviews.llvm.org/D105931 --- openmp/runtime/src/kmp_settings.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index a98a2a43b0d06..b4b8fbe1aa0ad 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -164,7 +164,12 @@ int __kmp_convert_to_milliseconds(char const *data) { return (INT_MAX); value = (double)0.0; mult = '\0'; +#if KMP_OS_WINDOWS + // On Windows, each %c parameter needs additional size parameter for sscanf_s + nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1); +#else nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra); +#endif if (nvalues < 1) return (-1); if (nvalues == 1) From b4a1f441d90c5cb324efe45170614957ab666382 Mon Sep 17 00:00:00 2001 From: "Peyton, Jonathan L" Date: Mon, 28 Jun 2021 16:27:27 -0500 Subject: [PATCH 117/700] [OpenMP] Add a few small fixes * Add comment to help ensure new construct data are added in two places * Check for division by zero in the loop worksharing code * Check for syntax errors in parrange parsing Differential Revision: https://reviews.llvm.org/D105929 --- openmp/runtime/src/kmp.h | 2 ++ openmp/runtime/src/kmp_dispatch.cpp | 3 ++- openmp/runtime/src/kmp_settings.cpp | 23 ++++++++++++++++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index dbf1111a8976f..a815ee862a7e9 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1439,6 +1439,8 @@ __kmp_mm_mwait(unsigned extensions, unsigned hints) { /* Support datatypes for the orphaned construct nesting checks. 
*/ /* ------------------------------------------------------------------------ */ +/* When adding to this enum, add its corresponding string in cons_text_c[] + * array in kmp_error.cpp */ enum cons_type { ct_none, ct_parallel, diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index c97ffb2dd336c..0a75f8a54f595 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -561,6 +561,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, _control87(_PC_64, _MCW_PC); // 0,0x30000 #endif /* value used for comparison in solver for cross-over point */ + KMP_ASSERT(tc > 0); long double target = ((long double)chunk * 2 + 1) * nproc / tc; /* crossover point--chunk indexes equal to or greater than @@ -1715,7 +1716,7 @@ int __kmp_dispatch_next_algorithm(int gtid, status = 0; // nothing to do, don't try atomic op break; } - KMP_DEBUG_ASSERT(init % chunk == 0); + KMP_DEBUG_ASSERT(chunk && init % chunk == 0); // compare with K*nproc*(chunk+1), K=2 by default if ((T)remaining < pr->u.p.parm2) { // use dynamic-style schedule diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index b4b8fbe1aa0ad..3480872783455 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -431,6 +431,7 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, int *out_range, char *out_routine, char *out_file, int *out_lb, int *out_ub) { + const char *par_range_value; size_t len = KMP_STRLEN(value) + 1; par_range_to_print = (char *)KMP_INTERNAL_MALLOC(len + 1); KMP_STRNCPY_S(par_range_to_print, len + 1, value, len + 1); @@ -439,11 +440,14 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, __kmp_par_range_ub = INT_MAX; for (;;) { unsigned int len; - if (*value == '\0') { + if (!value || *value == '\0') { break; } if (!__kmp_strcasecmp_with_sentinel("routine", value, '=')) { - value = strchr(value, '=') + 1; + 
par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; len = __kmp_readstr_with_sentinel(out_routine, value, KMP_PAR_RANGE_ROUTINE_LEN - 1, ','); if (len == 0) { @@ -456,7 +460,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, continue; } if (!__kmp_strcasecmp_with_sentinel("filename", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; len = __kmp_readstr_with_sentinel(out_file, value, KMP_PAR_RANGE_FILENAME_LEN - 1, ','); if (len == 0) { @@ -470,7 +477,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, } if ((!__kmp_strcasecmp_with_sentinel("range", value, '=')) || (!__kmp_strcasecmp_with_sentinel("incl_range", value, '='))) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { goto par_range_error; } @@ -482,7 +492,10 @@ static void __kmp_stg_parse_par_range(char const *name, char const *value, continue; } if (!__kmp_strcasecmp_with_sentinel("excl_range", value, '=')) { - value = strchr(value, '=') + 1; + par_range_value = strchr(value, '=') + 1; + if (!par_range_value) + goto par_range_error; + value = par_range_value; if (KMP_SSCANF(value, "%d:%d", out_lb, out_ub) != 2) { goto par_range_error; } From d6fe8d37c68d9fc7a23a3fb15cdda7102d16ce69 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 16 Aug 2021 15:53:31 +0100 Subject: [PATCH 118/700] [DAG] Fold concat_vectors(concat_vectors(x,y),concat_vectors(a,b)) -> concat_vectors(x,y,a,b) Follow-up to D107068, attempt to fold nested concat_vectors/undefs, as long as both the vector and inner subvector types are legal. 
This exposed the same issue in ARM's MVE LowerCONCAT_VECTORS_i1 (raised as PR51365) and AArch64's performConcatVectorsCombine which both assumed concat_vectors only took 2 subvector operands. Differential Revision: https://reviews.llvm.org/D107597 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 48 +++- .../Target/AArch64/AArch64ISelLowering.cpp | 29 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 96 ++++--- .../rvv/fixed-vectors-extload-truncstore.ll | 258 +++++++----------- .../RISCV/rvv/fixed-vectors-fp-conv.ll | 28 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll | 188 +++++-------- .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll | 56 ++-- llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 38 ++- .../X86/merge-consecutive-loads-512.ll | 20 +- .../CodeGen/X86/vector-shuffle-512-v16.ll | 4 +- .../CodeGen/X86/x86-interleaved-access.ll | 14 +- 11 files changed, 359 insertions(+), 420 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bf83ccf535a9f..1c56d9e475136 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19865,6 +19865,44 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops)); } +// Attempt to merge nested concat_vectors/undefs. +// Fold concat_vectors(concat_vectors(x,y,z,w),u,u,concat_vectors(a,b,c,d)) +// --> concat_vectors(x,y,z,w,u,u,u,u,u,u,u,u,a,b,c,d) +static SDValue combineConcatVectorOfConcatVectors(SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Ensure we're concatenating UNDEF and CONCAT_VECTORS nodes of similar types. 
+ EVT SubVT; + SDValue FirstConcat; + for (const SDValue &Op : N->ops()) { + if (Op.isUndef()) + continue; + if (Op.getOpcode() != ISD::CONCAT_VECTORS) + return SDValue(); + if (!FirstConcat) { + SubVT = Op.getOperand(0).getValueType(); + if (!DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) + return SDValue(); + FirstConcat = Op; + continue; + } + if (SubVT != Op.getOperand(0).getValueType()) + return SDValue(); + } + assert(FirstConcat && "Concat of all-undefs found"); + + SmallVector ConcatOps; + for (const SDValue &Op : N->ops()) { + if (Op.isUndef()) { + ConcatOps.append(FirstConcat->getNumOperands(), DAG.getUNDEF(SubVT)); + continue; + } + ConcatOps.append(Op->op_begin(), Op->op_end()); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, ConcatOps); +} + // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at // most two distinct vectors the same size as the result, attempt to turn this @@ -20124,13 +20162,19 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { } // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR. + // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...). if (SDValue V = combineConcatVectorOfScalars(N, DAG)) return V; - // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. - if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) + if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { + // Fold CONCAT_VECTORS of CONCAT_VECTORS (or undef) to VECTOR_SHUFFLE. + if (SDValue V = combineConcatVectorOfConcatVectors(N, DAG)) + return V; + + // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. 
if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) return V; + } if (SDValue V = combineConcatVectorOfCasts(N, DAG)) return V; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index baa3feda74d3a..5497956c4580d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10459,8 +10459,29 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, isTypeLegal(Op.getValueType()) && "Expected legal scalable vector type!"); - if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) - return Op; + if (isTypeLegal(Op.getOperand(0).getValueType())) { + unsigned NumOperands = Op->getNumOperands(); + assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + if (Op.getNumOperands() == 2) + return Op; + + // Concat each pair of subvectors and pack into the lower half of the array. + SmallVector ConcatOps(Op->op_begin(), Op->op_end()); + while (ConcatOps.size() > 1) { + for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { + SDValue V1 = ConcatOps[I]; + SDValue V2 = ConcatOps[I + 1]; + EVT SubVT = V1.getValueType(); + EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + ConcatOps[I / 2] = + DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2); + } + ConcatOps.resize(ConcatOps.size() / 2); + } + return ConcatOps[0]; + } return SDValue(); } @@ -13621,7 +13642,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. 
- if (N0 == N1 && VT.getVectorNumElements() == 2) { + if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getScalarSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); @@ -13636,7 +13657,7 @@ static SDValue performConcatVectorsCombine(SDNode *N, // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - if (N1Opc != ISD::BITCAST) + if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 715725aa093a4..e3f5f4948bf64 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -8824,54 +8824,68 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); SDLoc dl(Op); - EVT VT = Op.getValueType(); - EVT Op1VT = V1.getValueType(); - EVT Op2VT = V2.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - assert(Op1VT == Op2VT && "Operand types don't match!"); - assert(VT.getScalarSizeInBits() == 1 && + assert(Op.getValueType().getScalarSizeInBits() == 1 && + "Unexpected custom CONCAT_VECTORS lowering"); + assert(isPowerOf2_32(Op.getNumOperands()) && "Unexpected custom CONCAT_VECTORS lowering"); assert(ST->hasMVEIntegerOps() && "CONCAT_VECTORS lowering only supported for MVE"); - SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); - SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); - - // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets - // promoted to v8i16, etc. 
- - MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); - - // Extract the vector elements from Op1 and Op2 one by one and truncate them - // to be the right size for the destination. For example, if Op1 is v4i1 then - // the promoted vector is v4i32. The result of concatentation gives a v8i1, - // which when promoted is v8i16. That means each i32 element from Op1 needs - // truncating to i16 and inserting in the result. - EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); - SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); - auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { - EVT NewVT = NewV.getValueType(); - EVT ConcatVT = ConVec.getValueType(); - for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, - DAG.getIntPtrConstant(i, dl)); - ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, - DAG.getConstant(j, dl, MVT::i32)); - } - return ConVec; + auto ConcatPair = [&](SDValue V1, SDValue V2) { + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + assert(Op1VT == Op2VT && "Operand types don't match!"); + EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext()); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. + MVT ElType = + getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + unsigned NumElts = 2 * Op1VT.getVectorNumElements(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 + // then the promoted vector is v4i32. The result of concatentation gives a + // v8i1, which when promoted is v8i16. 
That means each i32 element from Op1 + // needs truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + return ConVec; + }; + unsigned j = 0; + ConVec = ExtractInto(NewV1, ConVec, j); + ConVec = ExtractInto(NewV2, ConVec, j); + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); }; - unsigned j = 0; - ConVec = ExractInto(NewV1, ConVec, j); - ConVec = ExractInto(NewV2, ConVec, j); - // Now return the result of comparing the subvector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. - return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + // Concat each pair of subvectors and pack into the lower half of the array. 
+ SmallVector ConcatOps(Op->op_begin(), Op->op_end()); + while (ConcatOps.size() > 1) { + for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { + SDValue V1 = ConcatOps[I]; + SDValue V2 = ConcatOps[I + 1]; + ConcatOps[I / 2] = ConcatPair(V1, V2); + } + ConcatOps.resize(ConcatOps.size() / 2); + } + return ConcatOps[0]; } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll index 15627a32145ea..cf14b88e93757 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll @@ -1290,37 +1290,30 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %z) { ; LMULMAX1-NEXT: vnsrl.wi v25, v8, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v25, 4 -; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v27, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu -; LMULMAX1-NEXT: 
vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 12, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 4 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v26, 8 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 12 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; LMULMAX1-NEXT: vse8.v v25, (a0) +; LMULMAX1-NEXT: vse8.v v26, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v16i32_v16i8: @@ -1624,43 +1617,36 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %z) { ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v25, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, 
tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v27, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v26, 4 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vse8.v v25, (a0) +; LMULMAX1-NEXT: vse8.v v26, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v8i64_v8i8: @@ -1685,37 +1671,30 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %z) { ; LMULMAX1-NEXT: vnsrl.wi v25, v8, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 +; LMULMAX1-NEXT: vsetivli 
zero, 2, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v25, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v27, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v26, 4 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; LMULMAX1-NEXT: vse16.v v25, (a0) +; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v8i64_v8i16: @@ -1779,88 +1758,68 @@ define void @truncstore_v16i64_v16i8(<16 x i64> %x, <16 x i8>* %z) { ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli 
zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v25 -; LMULMAX1-NEXT: vslideup.vi v27, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v26, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v28, v26 -; LMULMAX1-NEXT: vslideup.vi v28, v27, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: 
vslideup.vi v29, v27, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v29, 4 -; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-NEXT: vmv.v.i v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 6 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v12, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v12, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 10, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 8 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v13, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v28, 2 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 12, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v14, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v14, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; 
LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 14, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 12 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v15, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v15, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v26, 8 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 14 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; LMULMAX1-NEXT: vse8.v v27, (a0) +; LMULMAX1-NEXT: vse8.v v25, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v16i64_v16i8: @@ -1897,67 +1856,54 @@ define void @truncstore_v16i64_v16i16(<16 x i64> %x, <16 x i16>* %z) { ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v8, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v28, v25 -; LMULMAX1-NEXT: vslideup.vi v28, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v9, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v26, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: 
vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; LMULMAX1-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v27, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v28, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v27, v25, 4 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 +; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v29, 4 +; LMULMAX1-NEXT: vslideup.vi v27, v25, 6 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v12, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v12, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; 
LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v13, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v28, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v14, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v14, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v15, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v15, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 6 ; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; LMULMAX1-NEXT: vse16.v v26, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll index 91fa3c4a48b55..6b1130a072007 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -196,15 +196,15 @@ define void @fpround_v8f64_v8f16(<8 x double>* %x, <8 x half>* %y) { ; ; LMULMAX1-LABEL: 
fpround_v8f64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -48 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: addi a2, a0, 32 +; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle64.v v26, (a2) ; LMULMAX1-NEXT: addi a2, a0, 48 ; LMULMAX1-NEXT: vle64.v v27, (a2) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a0, a0, 32 ; LMULMAX1-NEXT: vle64.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v29, v27 @@ -216,35 +216,25 @@ define void @fpround_v8f64_v8f16(<8 x double>* %x, <8 x half>* %y) { ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 20 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vle16.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 40 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 ; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 -; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli 
zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 48 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptrunc <8 x double> %a to <8 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index 1846ffb983605..9646e6eada6fd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -448,43 +448,36 @@ define void @fp2si_v8f64_v8i8(<8 x double>* %x, <8 x i8>* %y) { ; LMULMAX1-NEXT: vnsrl.wi v27, v29, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v30, v29 -; LMULMAX1-NEXT: vslideup.vi v30, v27, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v27, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v30, 0 +; LMULMAX1-NEXT: vslideup.vi v29, v27, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v28, v26 +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, 
v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v26, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v29, 4 +; LMULMAX1-NEXT: vslideup.vi v29, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vse8.v v27, (a1) +; LMULMAX1-NEXT: vse8.v v29, (a1) ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptosi <8 x double> %a to <8 x i8> @@ -522,43 +515,36 @@ define void @fp2ui_v8f64_v8i8(<8 x double>* %x, <8 x i8>* %y) { ; LMULMAX1-NEXT: vnsrl.wi v27, v29, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v30, v29 -; LMULMAX1-NEXT: vslideup.vi v30, v27, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, 
v27, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v30, 0 +; LMULMAX1-NEXT: vslideup.vi v29, v27, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v28, v26 +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v26, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v29, 4 +; LMULMAX1-NEXT: vslideup.vi v29, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vse8.v v27, (a1) +; LMULMAX1-NEXT: vse8.v v29, (a1) ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptoui <8 x double> %a to <8 x i8> @@ -577,7 +563,7 @@ define <8 x i1> @fp2si_v8f64_v8i1(<8 x double> %x) { ; ; LMULMAX1-LABEL: fp2si_v8f64_v8i1: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vmclr.m v0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 @@ -588,58 +574,43 @@ define <8 x i1> @fp2si_v8f64_v8i1(<8 x double> %x) { ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: 
vmv.v.i v27, 0 ; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v26 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 0 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v29, v9 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v29, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v28, v9 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 ; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vmclr.m v0 -; LMULMAX1-NEXT: vmerge.vim v30, v29, 1, v0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v28, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v30, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v29, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v29, v10 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; 
LMULMAX1-NEXT: vslideup.vi v26, v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v29, v11 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v28, v10 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v27, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 4 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v25, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v26, v11 +; LMULMAX1-NEXT: vand.vi v26, v26, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v26, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v25, 4 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v25, 0 ; LMULMAX1-NEXT: ret %z = fptosi <8 x double> %x to <8 x i1> ret <8 x i1> %z @@ -656,7 +627,7 @@ define <8 x i1> @fp2ui_v8f64_v8i1(<8 x double> %x) { ; ; LMULMAX1-LABEL: fp2ui_v8f64_v8i1: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vmclr.m v0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 @@ -667,58 +638,43 @@ define <8 x i1> @fp2ui_v8f64_v8i1(<8 x 
double> %x) { ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v26 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 0 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v29, v9 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v29, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v28, v9 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 ; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vmclr.m v0 -; LMULMAX1-NEXT: vmerge.vim v30, v29, 1, v0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v28, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v30, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v29, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v29, v10 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: 
vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v29, v11 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v28, v10 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v27, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 4 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v25, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v26, v11 +; LMULMAX1-NEXT: vand.vi v26, v26, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v26, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v25, 4 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v25, 0 ; LMULMAX1-NEXT: ret %z = fptoui <8 x double> %x to <8 x i1> ret <8 x i1> %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index a1522ac3169eb..555f11a4e24e4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -466,15 +466,15 @@ define void 
@si2fp_v8i64_v8f16(<8 x i64>* %x, <8 x half>* %y) { ; ; LMULMAX1-LABEL: si2fp_v8i64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -48 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: addi a2, a0, 32 +; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle64.v v26, (a2) ; LMULMAX1-NEXT: addi a2, a0, 48 ; LMULMAX1-NEXT: vle64.v v27, (a2) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a0, a0, 32 ; LMULMAX1-NEXT: vle64.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.x.w v29, v27 @@ -486,35 +486,25 @@ define void @si2fp_v8i64_v8f16(<8 x i64>* %x, <8 x half>* %y) { ; LMULMAX1-NEXT: vfncvt.f.x.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 20 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.x.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vle16.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 40 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.x.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 ; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 -; 
LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 48 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %d = sitofp <8 x i64> %a to <8 x half> @@ -536,15 +526,15 @@ define void @ui2fp_v8i64_v8f16(<8 x i64>* %x, <8 x half>* %y) { ; ; LMULMAX1-LABEL: ui2fp_v8i64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -48 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: addi a2, a0, 32 +; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle64.v v26, (a2) ; LMULMAX1-NEXT: addi a2, a0, 48 ; LMULMAX1-NEXT: vle64.v v27, (a2) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a0, a0, 32 ; LMULMAX1-NEXT: vle64.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v29, v27 @@ -556,35 +546,25 @@ define void @ui2fp_v8i64_v8f16(<8 x i64>* %x, <8 x half>* %y) { ; LMULMAX1-NEXT: vfncvt.f.xu.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 20 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vle16.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 40 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; 
LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 ; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 -; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 48 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %d = uitofp <8 x i64> %a to <8 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index 82c09251b329c..86d696f0c2341 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2267,33 +2267,31 @@ define @mgather_baseidx_nxv32i8(i8* %base, ; RV64-LABEL: mgather_baseidx_nxv32i8: ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v25, v0 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu +; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a2, a1, 2 -; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu -; RV64-NEXT: vslidedown.vx v26, v0, a2 -; RV64-NEXT: srli a1, a1, 3 -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v26, a1 -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v11 +; RV64-NEXT: srli a2, a1, 3 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v9 ; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vluxei64.v v15, 
(a0), v16, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v13, (a0), v16, v0.t +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; RV64-NEXT: vslidedown.vx v0, v25, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v10 ; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vmv1r.v v0, v26 ; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vmv1r.v v0, v25 -; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v25, a1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vx v0, v0, a2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v9 +; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vluxei64.v v13, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, %idxs diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll index a0818e96bf215..411f27f60b30c 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -8,17 +8,13 @@ define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_12u4: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %ymm0 -; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovups 16(%rdi), %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_12u4: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %ymm0 -; X86-AVX512F-NEXT: vinsertf128 $1, 
64(%eax), %ymm0, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovups 16(%eax), %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 @@ -35,19 +31,15 @@ define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable n define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_23z5: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 32(%rdi), %ymm0 -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovdqu64 32(%rdi), %zmm0 +; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_23z5: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 32(%eax), %ymm0 -; X86-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovdqu64 32(%eax), %zmm0 +; X86-AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index 05caca8eaae1b..0037cfcf47f4a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -575,9 +575,7 @@ define <16 x float> @insert_sub1_12(<16 x float> %base, <4 x float> %sub1, <4 x define <16 x float> @insert_sub2_4(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) { ; ALL-LABEL: insert_sub2_4: ; ALL: # %bb.0: -; ALL-NEXT: 
vinsertf32x4 $2, %xmm3, %zmm0, %zmm1 -; ALL-NEXT: vmovapd {{.*#+}} zmm2 = [0,1,12,13,4,5,6,7] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: vinsertf32x4 $1, %xmm3, %zmm0, %zmm0 ; ALL-NEXT: retq %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> %sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32> diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 66f30754fd7af..258b54203438b 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -369,8 +369,8 @@ define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi) ; AVX512-NEXT: vzeroupper @@ -1467,15 +1467,15 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5 +; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, 
%ymm3 -; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 -; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1 From 5033f0793fe6e8b36990e6ce9f9ec2bf2ff79923 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sun, 15 Aug 2021 17:59:32 +0200 Subject: [PATCH 119/700] [lldb] Avoid unhandled Error in TypeSystemMap::GetTypeSystemForLanguage When assertions are turned off, the `llvm::Error` value created at the start of this function is overwritten using the move-assignment operator, but the success value is never checked. Whenever a TypeSystem cannot be found or created, this can lead to lldb core dumping with: Program aborted due to an unhandled Error: Error value was Success. (Note: Success values must still be checked prior to being destroyed). Fix this by not creating a `llvm::Error` value in advance, and directly returning the result of `llvm::make_error` instead, whenever an error is encountered. See also: and . 
Reviewed By: teemperor Differential Revision: https://reviews.llvm.org/D108088 --- lldb/source/Symbol/TypeSystem.cpp | 90 +++++++++++++------------------ 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index 252b06e269d6b..0b3f7e4f3bd4e 100644 --- a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -223,62 +223,32 @@ void TypeSystemMap::ForEach(std::function const &callback) { llvm::Expected TypeSystemMap::GetTypeSystemForLanguage( lldb::LanguageType language, llvm::Optional create_callback) { - llvm::Error error = llvm::Error::success(); - assert(!error); // Check the success value when assertions are enabled std::lock_guard guard(m_mutex); - if (m_clear_in_progress) { - error = llvm::make_error( + if (m_clear_in_progress) + return llvm::make_error( "Unable to get TypeSystem because TypeSystemMap is being cleared", llvm::inconvertibleErrorCode()); - } else { - collection::iterator pos = m_map.find(language); - if (pos != m_map.end()) { - auto *type_system = pos->second.get(); - if (type_system) { - llvm::consumeError(std::move(error)); - return *type_system; - } - error = llvm::make_error( - "TypeSystem for language " + - llvm::StringRef(Language::GetNameForLanguageType(language)) + - " doesn't exist", - llvm::inconvertibleErrorCode()); - return std::move(error); - } - for (const auto &pair : m_map) { - if (pair.second && pair.second->SupportsLanguage(language)) { - // Add a new mapping for "language" to point to an already existing - // TypeSystem that supports this language - m_map[language] = pair.second; - if (pair.second.get()) { - llvm::consumeError(std::move(error)); - return *pair.second.get(); - } - error = llvm::make_error( - "TypeSystem for language " + - llvm::StringRef(Language::GetNameForLanguageType(language)) + - " doesn't exist", - llvm::inconvertibleErrorCode()); - return std::move(error); - } - } + collection::iterator pos = 
m_map.find(language); + if (pos != m_map.end()) { + auto *type_system = pos->second.get(); + if (type_system) + return *type_system; + return llvm::make_error( + "TypeSystem for language " + + llvm::StringRef(Language::GetNameForLanguageType(language)) + + " doesn't exist", + llvm::inconvertibleErrorCode()); + } - if (!create_callback) { - error = llvm::make_error( - "Unable to find type system for language " + - llvm::StringRef(Language::GetNameForLanguageType(language)), - llvm::inconvertibleErrorCode()); - } else { - // Cache even if we get a shared pointer that contains a null type system - // back - TypeSystemSP type_system_sp = (*create_callback)(); - m_map[language] = type_system_sp; - if (type_system_sp.get()) { - llvm::consumeError(std::move(error)); - return *type_system_sp.get(); - } - error = llvm::make_error( + for (const auto &pair : m_map) { + if (pair.second && pair.second->SupportsLanguage(language)) { + // Add a new mapping for "language" to point to an already existing + // TypeSystem that supports this language + m_map[language] = pair.second; + if (pair.second.get()) + return *pair.second.get(); + return llvm::make_error( "TypeSystem for language " + llvm::StringRef(Language::GetNameForLanguageType(language)) + " doesn't exist", @@ -286,7 +256,23 @@ llvm::Expected TypeSystemMap::GetTypeSystemForLanguage( } } - return std::move(error); + if (!create_callback) + return llvm::make_error( + "Unable to find type system for language " + + llvm::StringRef(Language::GetNameForLanguageType(language)), + llvm::inconvertibleErrorCode()); + + // Cache even if we get a shared pointer that contains a null type system + // back + TypeSystemSP type_system_sp = (*create_callback)(); + m_map[language] = type_system_sp; + if (type_system_sp.get()) + return *type_system_sp.get(); + return llvm::make_error( + "TypeSystem for language " + + llvm::StringRef(Language::GetNameForLanguageType(language)) + + " doesn't exist", + llvm::inconvertibleErrorCode()); } 
llvm::Expected From 5ed162c8f9a430564820dd393f7ca8a666a739e2 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 16 Aug 2021 11:42:30 -0400 Subject: [PATCH 120/700] [libc++][NFC] Replace uses of 'constexpr friend' by 'friend constexpr' This is done for consistency, since that's what we do everywhere else in the library. --- .../std/ranges/range.adaptors/range.take/types.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libcxx/test/std/ranges/range.adaptors/range.take/types.h b/libcxx/test/std/ranges/range.adaptors/range.take/types.h index e260de2e38cf3..72563eecfc925 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.take/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.take/types.h @@ -34,12 +34,12 @@ using ForwardIter = forward_iterator; struct SizedForwardView : std::ranges::view_base { int *ptr_; constexpr SizedForwardView(int* ptr) : ptr_(ptr) {} - constexpr friend ForwardIter begin(SizedForwardView& view) { return ForwardIter(view.ptr_); } - constexpr friend ForwardIter begin(SizedForwardView const& view) { return ForwardIter(view.ptr_); } - constexpr friend sentinel_wrapper end(SizedForwardView& view) { + friend constexpr ForwardIter begin(SizedForwardView& view) { return ForwardIter(view.ptr_); } + friend constexpr ForwardIter begin(SizedForwardView const& view) { return ForwardIter(view.ptr_); } + friend constexpr sentinel_wrapper end(SizedForwardView& view) { return sentinel_wrapper{ForwardIter(view.ptr_ + 8)}; } - constexpr friend sentinel_wrapper end(SizedForwardView const& view) { + friend constexpr sentinel_wrapper end(SizedForwardView const& view) { return sentinel_wrapper{ForwardIter(view.ptr_ + 8)}; } }; @@ -55,12 +55,12 @@ using RandomAccessIter = random_access_iterator; struct SizedRandomAccessView : std::ranges::view_base { int *ptr_; constexpr SizedRandomAccessView(int* ptr) : ptr_(ptr) {} - constexpr friend RandomAccessIter begin(SizedRandomAccessView& view) { return 
RandomAccessIter(view.ptr_); } - constexpr friend RandomAccessIter begin(SizedRandomAccessView const& view) { return RandomAccessIter(view.ptr_); } - constexpr friend sentinel_wrapper end(SizedRandomAccessView& view) { + friend constexpr RandomAccessIter begin(SizedRandomAccessView& view) { return RandomAccessIter(view.ptr_); } + friend constexpr RandomAccessIter begin(SizedRandomAccessView const& view) { return RandomAccessIter(view.ptr_); } + friend constexpr sentinel_wrapper end(SizedRandomAccessView& view) { return sentinel_wrapper{RandomAccessIter(view.ptr_ + 8)}; } - constexpr friend sentinel_wrapper end(SizedRandomAccessView const& view) { + friend constexpr sentinel_wrapper end(SizedRandomAccessView const& view) { return sentinel_wrapper{RandomAccessIter(view.ptr_ + 8)}; } }; From 92abb1cf90ff65e729fca9d518150ad4c04d25e1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 16 Aug 2021 08:42:00 -0700 Subject: [PATCH 121/700] [TypePromotion] Don't mutate the result type of SwitchInst. SwitchInst should have a void result type. Add a check to the verifier to catch this error. Reviewed By: samparker Differential Revision: https://reviews.llvm.org/D108084 --- llvm/lib/CodeGen/TypePromotion.cpp | 4 ++-- llvm/lib/IR/Verifier.cpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp index 2ce6ea1d42120..1c9717e05eaf1 100644 --- a/llvm/lib/CodeGen/TypePromotion.cpp +++ b/llvm/lib/CodeGen/TypePromotion.cpp @@ -539,8 +539,8 @@ void IRPromoter::PromoteTree() { I->setOperand(i, UndefValue::get(ExtTy)); } - // Mutate the result type, unless this is an icmp. - if (!isa(I)) { + // Mutate the result type, unless this is an icmp or switch. 
+ if (!isa(I) && !isa(I)) { I->mutateType(ExtTy); Promoted.insert(I); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 5e93aa08c5af3..9392b4684d73f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2691,6 +2691,7 @@ void Verifier::visitReturnInst(ReturnInst &RI) { } void Verifier::visitSwitchInst(SwitchInst &SI) { + Assert(SI.getType()->isVoidTy(), "Switch must have void result type!", &SI); // Check to make sure that all of the constants in the switch instruction // have the same type as the switched-on value. Type *SwitchTy = SI.getCondition()->getType(); From cf521e78dfd2a418970358e5d55dcc9a862664e5 Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Mon, 16 Aug 2021 17:39:10 +0200 Subject: [PATCH 122/700] [lldb] Add tests for TypeSystemMap::GetTypeSystemForLanguage Regression tests for D108088 . Reviewed By: mib Differential Revision: https://reviews.llvm.org/D108121 --- lldb/unittests/Symbol/CMakeLists.txt | 1 + lldb/unittests/Symbol/TestTypeSystem.cpp | 92 ++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 lldb/unittests/Symbol/TestTypeSystem.cpp diff --git a/lldb/unittests/Symbol/CMakeLists.txt b/lldb/unittests/Symbol/CMakeLists.txt index 76c7b645f277a..748faf33b556b 100644 --- a/lldb/unittests/Symbol/CMakeLists.txt +++ b/lldb/unittests/Symbol/CMakeLists.txt @@ -1,6 +1,7 @@ add_lldb_unittest(SymbolTests LocateSymbolFileTest.cpp PostfixExpressionTest.cpp + TestTypeSystem.cpp TestTypeSystemClang.cpp TestClangASTImporter.cpp TestDWARFCallFrameInfo.cpp diff --git a/lldb/unittests/Symbol/TestTypeSystem.cpp b/lldb/unittests/Symbol/TestTypeSystem.cpp new file mode 100644 index 0000000000000..59297a7475d27 --- /dev/null +++ b/lldb/unittests/Symbol/TestTypeSystem.cpp @@ -0,0 +1,92 @@ +//===-- TestTypeSystem.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestingSupport/SubsystemRAII.h" +#include "lldb/Core/Module.h" +#include "lldb/Host/FileSystem.h" +#include "lldb/Host/HostInfo.h" +#include "lldb/Symbol/TypeSystem.h" +#include "gtest/gtest.h" + +using namespace lldb; +using namespace lldb_private; + +class TestTypeSystemMap : public testing::Test { +public: + SubsystemRAII subsystems; +}; + +TEST_F(TestTypeSystemMap, GetTypeSystemForLanguageWithInvalidModule) { + // GetTypeSystemForLanguage called with an invalid Module. + TypeSystemMap map; + Module module{ModuleSpec()}; + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeUnknown, &module, + /*can_create=*/true), + llvm::FailedWithMessage("TypeSystem for language unknown doesn't exist")); + + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeUnknown, &module, + /*can_create=*/false), + llvm::FailedWithMessage("TypeSystem for language unknown doesn't exist")); + + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeC, &module, + /*can_create=*/true), + llvm::FailedWithMessage("TypeSystem for language c doesn't exist")); + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeC, &module, + /*can_create=*/false), + llvm::FailedWithMessage("TypeSystem for language c doesn't exist")); +} + +TEST_F(TestTypeSystemMap, GetTypeSystemForLanguageWithNoModule) { + // GetTypeSystemForLanguage called with no Module. 
+ TypeSystemMap map; + Module *module = nullptr; + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeUnknown, module, + /*can_create=*/true), + llvm::FailedWithMessage("TypeSystem for language unknown doesn't exist")); + + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeUnknown, module, + /*can_create=*/false), + llvm::FailedWithMessage("TypeSystem for language unknown doesn't exist")); + + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeC, module, /*can_create=*/true), + llvm::FailedWithMessage("TypeSystem for language c doesn't exist")); + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeC, module, + /*can_create=*/false), + llvm::FailedWithMessage("TypeSystem for language c doesn't exist")); +} + +TEST_F(TestTypeSystemMap, GetTypeSystemForLanguageWithNoTarget) { + // GetTypeSystemForLanguage called with no Target. + TypeSystemMap map; + Target *target = nullptr; + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeUnknown, target, + /*can_create=*/true), + llvm::FailedWithMessage("TypeSystem for language unknown doesn't exist")); + + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeUnknown, target, + /*can_create=*/false), + llvm::FailedWithMessage("TypeSystem for language unknown doesn't exist")); + + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeC, target, /*can_create=*/true), + llvm::FailedWithMessage("TypeSystem for language c doesn't exist")); + EXPECT_THAT_EXPECTED( + map.GetTypeSystemForLanguage(eLanguageTypeC, target, + /*can_create=*/false), + llvm::FailedWithMessage("TypeSystem for language c doesn't exist")); +} From 94b4598d77fe0585a8a3bd2a798fc7ce15a6aa56 Mon Sep 17 00:00:00 2001 From: Paul Robinson Date: Mon, 16 Aug 2021 08:42:21 -0700 Subject: [PATCH 123/700] [PS4] stp[n]cpy not available on PS4 --- llvm/lib/Analysis/TargetLibraryInfo.cpp | 5 +++++ llvm/test/Transforms/InstCombine/sprintf-1.ll | 2 ++ 2 files changed, 7 
insertions(+) diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 1e377df2a3f7d..0a2031de5b89a 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -595,6 +595,11 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_stpncpy); } + if (T.isPS4()) { + TLI.setUnavailable(LibFunc_stpcpy); + TLI.setUnavailable(LibFunc_stpncpy); + } + // As currently implemented in clang, NVPTX code has no standard library to // speak of. Headers provide a standard-ish library implementation, but many // of the signatures are wrong -- for example, many libm functions are not diff --git a/llvm/test/Transforms/InstCombine/sprintf-1.ll b/llvm/test/Transforms/InstCombine/sprintf-1.ll index ac9cb5857e050..58cab2130f6e7 100644 --- a/llvm/test/Transforms/InstCombine/sprintf-1.ll +++ b/llvm/test/Transforms/InstCombine/sprintf-1.ll @@ -7,6 +7,8 @@ ; RUN: opt < %s -mtriple=i386-mingw32 -instcombine -S | FileCheck %s --check-prefixes=CHECK,WIN,NOSTPCPY ; RUN: opt < %s -mtriple=armv7-none-linux-android16 -instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY ; RUN: opt < %s -mtriple=armv7-none-linux-android21 -instcombine -S | FileCheck %s --check-prefixes=CHECK,WITHSTPCPY +; RUN: opt < %s -mtriple=x86_64-scei-ps4 -instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY + target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" From 4cfb047d6ab3259c20059e8ed4c15ece3d6eb723 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sun, 15 Aug 2021 23:43:34 +0200 Subject: [PATCH 124/700] [profile] Don't use pragma comment linker on mingw At least when compiling with gcc, this is not supported and will result in errors when linking against the profiler runtime. Only use the pragma comment linker based code with MSVC, but not with a mingw toolchain. 
This also undoes D107620, which shouldn't be relevant anymore. Differential Revision: https://reviews.llvm.org/D108095 --- compiler-rt/lib/profile/CMakeLists.txt | 3 --- compiler-rt/lib/profile/InstrProfilingFile.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 9688236c52ea8..f5e13574b7ce8 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -114,9 +114,6 @@ endif() append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS) # XRay uses C++ standard library headers. string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -# The Windows specific code uses a #pragma comment(linker, ...) which requires -# -fms-extensions on MinGW targets. -append_list_if(MINGW -fms-extensions EXTRA_FLAGS) # This appears to be a C-only warning banning the use of locals in aggregate # initializers. All other compilers accept this, though. diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index b99db321b4e60..fb4c2fecefacb 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -109,7 +109,7 @@ intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_DEFAULT_VAR = 0; /* This variable is a weak external reference which could be used to detect * whether or not the compiler defined this symbol. 
*/ -#if defined(_WIN32) +#if defined(_MSC_VER) COMPILER_RT_VISIBILITY extern intptr_t INSTR_PROF_PROFILE_COUNTER_BIAS_VAR; #if defined(_M_IX86) || defined(__i386__) #define WIN_SYM_PREFIX "_" From f62d0d48ea0297424d4fb7a241caaeb7be41d2d4 Mon Sep 17 00:00:00 2001 From: Paul Robinson Date: Mon, 16 Aug 2021 09:27:48 -0700 Subject: [PATCH 125/700] Fix whitespace typo in 94b4598 --- llvm/test/Transforms/InstCombine/sprintf-1.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/Transforms/InstCombine/sprintf-1.ll b/llvm/test/Transforms/InstCombine/sprintf-1.ll index 58cab2130f6e7..dc11eec52c95f 100644 --- a/llvm/test/Transforms/InstCombine/sprintf-1.ll +++ b/llvm/test/Transforms/InstCombine/sprintf-1.ll @@ -9,7 +9,6 @@ ; RUN: opt < %s -mtriple=armv7-none-linux-android21 -instcombine -S | FileCheck %s --check-prefixes=CHECK,WITHSTPCPY ; RUN: opt < %s -mtriple=x86_64-scei-ps4 -instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY - target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" @hello_world = constant [13 x i8] c"hello world\0A\00" From 778440f1996f2a442fb691058b97bbb0a9cf7300 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 16 Aug 2021 17:25:34 +0100 Subject: [PATCH 126/700] [X86] Add i128 funnel shift tests Test coverage for D108058 --- llvm/test/CodeGen/X86/fshl.ll | 596 +++++++++++++++++++++++++++++++++ llvm/test/CodeGen/X86/fshr.ll | 605 ++++++++++++++++++++++++++++++++++ 2 files changed, 1201 insertions(+) diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index 8b8312dde3d1d..19c521d05bd62 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -8,6 +8,7 @@ declare i8 @llvm.fshl.i8(i8, i8, i8) nounwind readnone declare i16 @llvm.fshl.i16(i16, i16, i16) nounwind readnone declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone declare i64 @llvm.fshl.i64(i64, i64, i64) nounwind readnone +declare 
i128 @llvm.fshl.i128(i128, i128, i128) nounwind readnone ; ; Variable Funnel Shift @@ -299,6 +300,601 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ret i64 %tmp } +define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { +; X86-FAST-LABEL: var_shift_i128: +; X86-FAST: # %bb.0: +; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: pushl %ebx +; X86-FAST-NEXT: pushl %edi +; X86-FAST-NEXT: pushl %esi +; X86-FAST-NEXT: subl $72, %esp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: shldl $31, %eax, %edi +; X86-FAST-NEXT: movl %ebx, %eax +; X86-FAST-NEXT: notl %ebx +; X86-FAST-NEXT: andl $127, %ebx +; X86-FAST-NEXT: movb $64, %cl +; X86-FAST-NEXT: subb %bl, %cl +; X86-FAST-NEXT: shrl %edx +; X86-FAST-NEXT: movl %edx, %ebp +; X86-FAST-NEXT: shldl %cl, %edi, %edx +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: shll %cl, %edx +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: jne .LBB6_1 +; X86-FAST-NEXT: # %bb.2: +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: jmp .LBB6_3 +; X86-FAST-NEXT: .LBB6_1: +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_3: +; X86-FAST-NEXT: andl $127, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movb %al, %ch +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movb %ch, %cl +; X86-FAST-NEXT: shldl %cl, %esi, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movb %bl, %cl +; X86-FAST-NEXT: addb $-64, %cl 
+; X86-FAST-NEXT: movl %edi, %eax +; X86-FAST-NEXT: movl %ebp, %edx +; X86-FAST-NEXT: shrdl %cl, %ebp, %eax +; X86-FAST-NEXT: shrl %cl, %ebp +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: jne .LBB6_4 +; X86-FAST-NEXT: # %bb.5: +; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: jmp .LBB6_6 +; X86-FAST-NEXT: .LBB6_4: +; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_6: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movb %ch, %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: shldl %cl, %eax, %ebp +; X86-FAST-NEXT: shll %cl, %eax +; X86-FAST-NEXT: shll %cl, %esi +; X86-FAST-NEXT: testb $32, %ch +; X86-FAST-NEXT: jne .LBB6_7 +; X86-FAST-NEXT: # %bb.8: +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: jmp .LBB6_9 +; X86-FAST-NEXT: .LBB6_7: +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %eax, %ebp +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_9: +; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: jb .LBB6_11 +; X86-FAST-NEXT: # %bb.10: +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_11: +; X86-FAST-NEXT: movb %bl, %cl +; X86-FAST-NEXT: shrdl %cl, %edx, %edi +; X86-FAST-NEXT: shrl %cl, %edx +; X86-FAST-NEXT: shldl $31, %eax, %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl 
{{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: shrdl $1, %ebp, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: shrdl %cl, %esi, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: shrl %cl, %eax +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: je .LBB6_13 +; X86-FAST-NEXT: # %bb.12: +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edx, %edi +; X86-FAST-NEXT: xorl %eax, %eax +; X86-FAST-NEXT: xorl %edx, %edx +; X86-FAST-NEXT: .LBB6_13: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: jb .LBB6_15 +; X86-FAST-NEXT: # %bb.14: +; X86-FAST-NEXT: xorl %ebp, %ebp +; X86-FAST-NEXT: .LBB6_15: +; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movb $64, %cl +; X86-FAST-NEXT: subb %ch, %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: shrl %cl, %ebp +; X86-FAST-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: movl $0, %edx +; X86-FAST-NEXT: jne .LBB6_17 +; X86-FAST-NEXT: # %bb.16: +; X86-FAST-NEXT: movl %ebp, %edx +; X86-FAST-NEXT: .LBB6_17: +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: addb $-64, %ch +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movb %ch, %cl +; X86-FAST-NEXT: shll %cl, %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: shldl %cl, %edi, %edx +; X86-FAST-NEXT: testb $32, %ch +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: jne 
.LBB6_19 +; X86-FAST-NEXT: # %bb.18: +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_19: +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jb .LBB6_21 +; X86-FAST-NEXT: # %bb.20: +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_21: +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: jae .LBB6_23 +; X86-FAST-NEXT: # %bb.22: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: .LBB6_23: +; X86-FAST-NEXT: testb $32, %ch +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: jne .LBB6_25 +; X86-FAST-NEXT: # %bb.24: +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_25: +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jb .LBB6_27 +; X86-FAST-NEXT: # %bb.26: +; X86-FAST-NEXT: xorl %edx, %edx +; X86-FAST-NEXT: .LBB6_27: +; X86-FAST-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: shrdl %cl, %esi, %edi +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-FAST-NEXT: jne .LBB6_29 +; X86-FAST-NEXT: # %bb.28: +; X86-FAST-NEXT: movl %edi, %ebp +; X86-FAST-NEXT: .LBB6_29: +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-FAST-NEXT: jae .LBB6_31 +; X86-FAST-NEXT: # %bb.30: +; X86-FAST-NEXT: orl %ebp, %esi +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_31: +; X86-FAST-NEXT: movl 
{{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jae .LBB6_33 +; X86-FAST-NEXT: # %bb.32: +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-FAST-NEXT: movl %eax, %ebp +; X86-FAST-NEXT: .LBB6_33: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jae .LBB6_35 +; X86-FAST-NEXT: # %bb.34: +; X86-FAST-NEXT: movl %edx, %ecx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: orl %eax, %edx +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %ecx, %edx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-FAST-NEXT: .LBB6_35: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: testl %ebx, %ebx +; X86-FAST-NEXT: je .LBB6_37 +; X86-FAST-NEXT: # %bb.36: +; X86-FAST-NEXT: movl %ebp, %ecx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-FAST-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_37: +; X86-FAST-NEXT: orl %ecx, %edi +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-FAST-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: je .LBB6_39 +; X86-FAST-NEXT: # %bb.38: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-FAST-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-FAST-NEXT: .LBB6_39: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-FAST-NEXT: orl %edx, %esi +; X86-FAST-NEXT: movl %ecx, 12(%eax) 
+; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: movl %edi, 4(%eax) +; X86-FAST-NEXT: movl %ebx, (%eax) +; X86-FAST-NEXT: addl $72, %esp +; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: popl %edi +; X86-FAST-NEXT: popl %ebx +; X86-FAST-NEXT: popl %ebp +; X86-FAST-NEXT: retl $4 +; +; X86-SLOW-LABEL: var_shift_i128: +; X86-SLOW: # %bb.0: +; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: pushl %ebx +; X86-SLOW-NEXT: pushl %edi +; X86-SLOW-NEXT: pushl %esi +; X86-SLOW-NEXT: subl $76, %esp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: andl $127, %eax +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: # kill: def $al killed $al killed $eax +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movb %al, %ah +; X86-SLOW-NEXT: notb %ah +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: jne .LBB6_1 +; X86-SLOW-NEXT: # %bb.2: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: orl (%esp), %edx # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: jmp .LBB6_3 +; 
X86-SLOW-NEXT: .LBB6_1: +; X86-SLOW-NEXT: movl %ebx, %ebp +; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: xorl %ebx, %ebx +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload +; X86-SLOW-NEXT: jb .LBB6_5 +; X86-SLOW-NEXT: # %bb.4: +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_5: +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: notl %ebx +; X86-SLOW-NEXT: andl $127, %ebx +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: movl %esi, %ecx +; X86-SLOW-NEXT: shrl %ecx +; X86-SLOW-NEXT: movl %eax, %esi +; X86-SLOW-NEXT: shll $31, %esi +; X86-SLOW-NEXT: orl %ecx, %esi +; X86-SLOW-NEXT: movl %esi, %ecx +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: movl $0, %esi +; X86-SLOW-NEXT: movl $0, %ecx +; X86-SLOW-NEXT: jne .LBB6_7 +; X86-SLOW-NEXT: # %bb.6: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movl %ebp, %ecx +; X86-SLOW-NEXT: .LBB6_7: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: shrl %eax +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: shll $31, %esi +; 
X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: addl %edi, %edi +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: jne .LBB6_9 +; X86-SLOW-NEXT: # %bb.8: +; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: .LBB6_9: +; X86-SLOW-NEXT: movb %bl, %dh +; X86-SLOW-NEXT: addb $-64, %dh +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movb %dh, %cl +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: testb $32, %dh +; X86-SLOW-NEXT: movl $0, %ecx +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: jne .LBB6_11 +; X86-SLOW-NEXT: # %bb.10: +; X86-SLOW-NEXT: movl %esi, %ecx +; X86-SLOW-NEXT: .LBB6_11: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: jb .LBB6_13 +; X86-SLOW-NEXT: # %bb.12: +; X86-SLOW-NEXT: xorl %eax, %eax +; X86-SLOW-NEXT: .LBB6_13: +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movb $64, %ch +; X86-SLOW-NEXT: movb $64, %ah +; X86-SLOW-NEXT: subb %dl, %ah +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: leal (%ebp,%ebp), %edi +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: testb $32, %ah +; 
X86-SLOW-NEXT: jne .LBB6_14 +; X86-SLOW-NEXT: # %bb.15: +; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: jmp .LBB6_16 +; X86-SLOW-NEXT: .LBB6_14: +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_16: +; X86-SLOW-NEXT: addb $-64, %dl +; X86-SLOW-NEXT: movb %dl, %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: movb %dl, %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: testb $32, %dl +; X86-SLOW-NEXT: jne .LBB6_17 +; X86-SLOW-NEXT: # %bb.18: +; X86-SLOW-NEXT: orl %eax, %edi +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jae .LBB6_20 +; X86-SLOW-NEXT: jmp .LBB6_21 +; X86-SLOW-NEXT: .LBB6_17: +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: xorl %esi, %esi +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jb .LBB6_21 +; X86-SLOW-NEXT: .LBB6_20: +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_21: +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jae .LBB6_23 +; X86-SLOW-NEXT: # %bb.22: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: orl %ebp, %esi +; X86-SLOW-NEXT: .LBB6_23: +; X86-SLOW-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jae .LBB6_25 +; X86-SLOW-NEXT: # %bb.24: +; X86-SLOW-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_25: +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: shll $31, %esi +; X86-SLOW-NEXT: orl %edi, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movb %bl, %cl +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: addl %edi, %edi +; X86-SLOW-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: jne .LBB6_27 +; X86-SLOW-NEXT: # %bb.26: +; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_27: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: movb %dh, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: testb $32, %dh +; X86-SLOW-NEXT: jne .LBB6_29 +; X86-SLOW-NEXT: # %bb.28: +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_29: +; X86-SLOW-NEXT: subb %bl, %ch +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SLOW-NEXT: shll %cl, %edx +; X86-SLOW-NEXT: testb $32, %ch +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl %edi, %ecx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: jne .LBB6_30 +; 
X86-SLOW-NEXT: # %bb.31: +; X86-SLOW-NEXT: orl %ecx, %edx +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jb .LBB6_33 +; X86-SLOW-NEXT: jmp .LBB6_34 +; X86-SLOW-NEXT: .LBB6_30: +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: xorl %eax, %eax +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jae .LBB6_34 +; X86-SLOW-NEXT: .LBB6_33: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SLOW-NEXT: orl %eax, %edx +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_34: +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jb .LBB6_35 +; X86-SLOW-NEXT: # %bb.36: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SLOW-NEXT: jmp .LBB6_37 +; X86-SLOW-NEXT: .LBB6_35: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: orl %ecx, %eax +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: .LBB6_37: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: testl %ebx, %ebx +; X86-SLOW-NEXT: je .LBB6_39 +; X86-SLOW-NEXT: # %bb.38: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_39: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-SLOW-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: je .LBB6_41 +; X86-SLOW-NEXT: # %bb.40: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: .LBB6_41: +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-SLOW-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %ebx, (%eax) +; X86-SLOW-NEXT: movl %ebp, 4(%eax) +; X86-SLOW-NEXT: addl $76, %esp +; X86-SLOW-NEXT: popl %esi +; X86-SLOW-NEXT: popl %edi +; X86-SLOW-NEXT: popl %ebx +; X86-SLOW-NEXT: popl %ebp +; X86-SLOW-NEXT: retl $4 +; +; X64-FAST-LABEL: var_shift_i128: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: movq %r8, %r9 +; X64-FAST-NEXT: movq %rcx, %r10 +; X64-FAST-NEXT: movq %rdx, %r8 +; X64-FAST-NEXT: movq %rsi, %rdx +; X64-FAST-NEXT: movl %r9d, %ecx +; X64-FAST-NEXT: shldq %cl, %rdi, %rdx +; X64-FAST-NEXT: shrdq $1, %r10, %r8 +; X64-FAST-NEXT: shrq %r10 +; X64-FAST-NEXT: notb %cl +; X64-FAST-NEXT: shrdq %cl, %r10, %r8 +; X64-FAST-NEXT: shrq %cl, %r10 +; X64-FAST-NEXT: xorl %eax, %eax +; X64-FAST-NEXT: testb $64, %cl +; X64-FAST-NEXT: cmovneq %r10, %r8 +; X64-FAST-NEXT: cmovneq %rax, %r10 +; X64-FAST-NEXT: movl %r9d, %ecx +; X64-FAST-NEXT: shlq %cl, %rdi +; X64-FAST-NEXT: testb $64, %r9b +; X64-FAST-NEXT: cmovneq %rdi, %rdx +; X64-FAST-NEXT: cmoveq %rdi, %rax +; X64-FAST-NEXT: orq %r8, %rax +; X64-FAST-NEXT: orq %r10, %rdx +; X64-FAST-NEXT: retq +; +; X64-SLOW-LABEL: var_shift_i128: +; X64-SLOW: # %bb.0: +; X64-SLOW-NEXT: movq %rcx, %r11 +; X64-SLOW-NEXT: movq %rdx, %r9 +; X64-SLOW-NEXT: movl %r8d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rsi +; X64-SLOW-NEXT: movq %rdi, %rdx +; X64-SLOW-NEXT: shrq %rdx +; X64-SLOW-NEXT: movl %r8d, %r10d +; X64-SLOW-NEXT: notb %r10b +; X64-SLOW-NEXT: movl %r10d, %ecx +; X64-SLOW-NEXT: shrq %cl, %rdx +; X64-SLOW-NEXT: orq %rsi, %rdx +; X64-SLOW-NEXT: shrq %r9 +; X64-SLOW-NEXT: movq %r11, %rax +; X64-SLOW-NEXT: shlq $63, %rax +; X64-SLOW-NEXT: orq %r9, %rax +; X64-SLOW-NEXT: shrq %cl, %rax +; X64-SLOW-NEXT: shrq %r11 +; X64-SLOW-NEXT: leaq (%r11,%r11), %rsi +; X64-SLOW-NEXT: movl %r8d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rsi +; X64-SLOW-NEXT: orq %rax, %rsi +; X64-SLOW-NEXT: movl %r10d, %ecx +; 
X64-SLOW-NEXT: shrq %cl, %r11 +; X64-SLOW-NEXT: xorl %eax, %eax +; X64-SLOW-NEXT: testb $64, %r10b +; X64-SLOW-NEXT: cmovneq %r11, %rsi +; X64-SLOW-NEXT: cmovneq %rax, %r11 +; X64-SLOW-NEXT: movl %r8d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rdi +; X64-SLOW-NEXT: testb $64, %r8b +; X64-SLOW-NEXT: cmovneq %rdi, %rdx +; X64-SLOW-NEXT: cmoveq %rdi, %rax +; X64-SLOW-NEXT: orq %rsi, %rax +; X64-SLOW-NEXT: orq %r11, %rdx +; X64-SLOW-NEXT: retq + %tmp = tail call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) + ret i128 %tmp +} + ; ; Const Funnel Shift ; diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 5decfb6f35e61..243c02328ead0 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -8,6 +8,7 @@ declare i8 @llvm.fshr.i8(i8, i8, i8) nounwind readnone declare i16 @llvm.fshr.i16(i16, i16, i16) nounwind readnone declare i32 @llvm.fshr.i32(i32, i32, i32) nounwind readnone declare i64 @llvm.fshr.i64(i64, i64, i64) nounwind readnone +declare i128 @llvm.fshr.i128(i128, i128, i128) nounwind readnone ; ; Variable Funnel Shift @@ -299,6 +300,610 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind { ret i64 %tmp } +define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { +; X86-FAST-LABEL: var_shift_i128: +; X86-FAST: # %bb.0: +; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: pushl %ebx +; X86-FAST-NEXT: pushl %edi +; X86-FAST-NEXT: pushl %esi +; X86-FAST-NEXT: subl $76, %esp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl %ebx, %ecx +; X86-FAST-NEXT: andl $127, %ecx +; X86-FAST-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movb %cl, %ch +; X86-FAST-NEXT: movb $64, %cl +; X86-FAST-NEXT: subb %ch, %cl +; X86-FAST-NEXT: shll %cl, %edi +; X86-FAST-NEXT: movb %cl, (%esp) # 1-byte Spill +; X86-FAST-NEXT: testb $32, %cl +; 
X86-FAST-NEXT: movl $0, %esi +; X86-FAST-NEXT: jne .LBB6_2 +; X86-FAST-NEXT: # %bb.1: +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: .LBB6_2: +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %eax, %edi +; X86-FAST-NEXT: movl %ebp, %eax +; X86-FAST-NEXT: shldl $1, %ebp, %edi +; X86-FAST-NEXT: addl %ebp, %eax +; X86-FAST-NEXT: notl %ebx +; X86-FAST-NEXT: andl $127, %ebx +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movb %bl, %cl +; X86-FAST-NEXT: shldl %cl, %eax, %edi +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: shll %cl, %eax +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: movl %eax, %esi +; X86-FAST-NEXT: jne .LBB6_4 +; X86-FAST-NEXT: # %bb.3: +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: .LBB6_4: +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-NEXT: movb %ch, %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: shrdl %cl, %edi, %esi +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: shrl %cl, %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: shrl %cl, %edi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: shrdl %cl, %edx, %ebp +; X86-FAST-NEXT: testb $32, %ch +; X86-FAST-NEXT: jne .LBB6_5 +; X86-FAST-NEXT: # %bb.6: +; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: jmp .LBB6_7 +; 
X86-FAST-NEXT: .LBB6_5: +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: xorl %edi, %edi +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_7: +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: movl $0, %esi +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: jne .LBB6_9 +; X86-FAST-NEXT: # %bb.8: +; X86-FAST-NEXT: movl %eax, %esi +; X86-FAST-NEXT: .LBB6_9: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: jb .LBB6_11 +; X86-FAST-NEXT: # %bb.10: +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_11: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: shrdl $31, %edi, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movb %bl, %cl +; X86-FAST-NEXT: shll %cl, %eax +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: movl $0, %edi +; X86-FAST-NEXT: jne .LBB6_13 +; X86-FAST-NEXT: # %bb.12: +; X86-FAST-NEXT: movl %eax, %edi +; X86-FAST-NEXT: .LBB6_13: +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movb (%esp), %cl # 1-byte Reload +; X86-FAST-NEXT: shldl %cl, %ebp, %eax +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: jne .LBB6_15 +; X86-FAST-NEXT: # %bb.14: +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_15: +; X86-FAST-NEXT: movb %bl, %dh +; X86-FAST-NEXT: addb $-64, %dh +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: movb %dh, %cl +; X86-FAST-NEXT: shll %cl, %eax +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: testb $32, %dh +; X86-FAST-NEXT: movl $0, %eax +; 
X86-FAST-NEXT: jne .LBB6_17 +; X86-FAST-NEXT: # %bb.16: +; X86-FAST-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-FAST-NEXT: .LBB6_17: +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: jb .LBB6_19 +; X86-FAST-NEXT: # %bb.18: +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_19: +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jb .LBB6_21 +; X86-FAST-NEXT: # %bb.20: +; X86-FAST-NEXT: xorl %esi, %esi +; X86-FAST-NEXT: .LBB6_21: +; X86-FAST-NEXT: addb $-64, %ch +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movb %ch, %cl +; X86-FAST-NEXT: shrl %cl, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: testb $32, %ch +; X86-FAST-NEXT: movl $0, %eax +; X86-FAST-NEXT: jne .LBB6_23 +; X86-FAST-NEXT: # %bb.22: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: .LBB6_23: +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: jae .LBB6_25 +; X86-FAST-NEXT: # %bb.24: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-FAST-NEXT: .LBB6_25: +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-FAST-NEXT: movb %ch, %cl +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: shrdl %cl, %eax, %ebp +; X86-FAST-NEXT: testb $32, %ch +; X86-FAST-NEXT: jne .LBB6_27 +; X86-FAST-NEXT: # %bb.26: +; X86-FAST-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_27: +; 
X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jb .LBB6_29 +; X86-FAST-NEXT: # %bb.28: +; X86-FAST-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-FAST-NEXT: .LBB6_29: +; X86-FAST-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: jae .LBB6_31 +; X86-FAST-NEXT: # %bb.30: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_31: +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: shldl $1, %eax, %ebp +; X86-FAST-NEXT: movl %ebp, %eax +; X86-FAST-NEXT: movl %ebx, %ecx +; X86-FAST-NEXT: shldl %cl, %edi, %eax +; X86-FAST-NEXT: testb $32, %bl +; X86-FAST-NEXT: jne .LBB6_33 +; X86-FAST-NEXT: # %bb.32: +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_33: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: movb %dh, %cl +; X86-FAST-NEXT: shldl %cl, %esi, %eax +; X86-FAST-NEXT: testb $32, %dh +; X86-FAST-NEXT: jne .LBB6_35 +; X86-FAST-NEXT: # %bb.34: +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_35: +; X86-FAST-NEXT: movb $64, %cl +; X86-FAST-NEXT: subb %bl, %cl +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: shrdl %cl, %eax, %esi +; X86-FAST-NEXT: shrl %cl, %eax +; X86-FAST-NEXT: testb $32, %cl +; X86-FAST-NEXT: je .LBB6_37 +; X86-FAST-NEXT: # %bb.36: +; X86-FAST-NEXT: movl %eax, %esi +; X86-FAST-NEXT: xorl %eax, %eax +; X86-FAST-NEXT: .LBB6_37: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jae .LBB6_39 +; X86-FAST-NEXT: # %bb.38: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-FAST-NEXT: orl %eax, %ecx +; X86-FAST-NEXT: movl %ecx, 
(%esp) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_39: +; X86-FAST-NEXT: cmpl $64, %ebx +; X86-FAST-NEXT: jae .LBB6_41 +; X86-FAST-NEXT: # %bb.40: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: orl %esi, %eax +; X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_41: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: testl %ebx, %ebx +; X86-FAST-NEXT: je .LBB6_43 +; X86-FAST-NEXT: # %bb.42: +; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-FAST-NEXT: .LBB6_43: +; X86-FAST-NEXT: orl %edx, %ebp +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-FAST-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-FAST-NEXT: je .LBB6_45 +; X86-FAST-NEXT: # %bb.44: +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: .LBB6_45: +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-FAST-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-FAST-NEXT: movl %ecx, 4(%eax) +; X86-FAST-NEXT: movl %esi, (%eax) +; X86-FAST-NEXT: movl %ebp, 12(%eax) +; X86-FAST-NEXT: movl %edi, 8(%eax) +; X86-FAST-NEXT: addl $76, %esp +; X86-FAST-NEXT: popl %esi +; X86-FAST-NEXT: popl %edi +; X86-FAST-NEXT: popl %ebx +; X86-FAST-NEXT: popl %ebp +; X86-FAST-NEXT: retl $4 +; +; X86-SLOW-LABEL: var_shift_i128: +; X86-SLOW: # %bb.0: +; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: pushl %ebx +; X86-SLOW-NEXT: pushl %edi +; X86-SLOW-NEXT: pushl %esi +; X86-SLOW-NEXT: subl $72, %esp 
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: andl $127, %eax +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: leal (%edi,%edi), %ebp +; X86-SLOW-NEXT: notb %al +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: leal (%esi,%esi), %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: testb $32, %dl +; X86-SLOW-NEXT: jne .LBB6_1 +; X86-SLOW-NEXT: # %bb.2: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-SLOW-NEXT: orl %edi, %ebx +; X86-SLOW-NEXT: movl %ebx, %esi +; X86-SLOW-NEXT: jmp .LBB6_3 +; X86-SLOW-NEXT: .LBB6_1: +; X86-SLOW-NEXT: movl %eax, %ebp +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: jb .LBB6_5 +; X86-SLOW-NEXT: # 
%bb.4: +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_5: +; X86-SLOW-NEXT: leal (%ecx,%ecx), %esi +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: notl %ebx +; X86-SLOW-NEXT: andl $127, %ebx +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shrl $31, %ecx +; X86-SLOW-NEXT: leal (%ecx,%edi,2), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %edi +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: movl $0, %edi +; X86-SLOW-NEXT: movl $0, %ecx +; X86-SLOW-NEXT: jne .LBB6_7 +; X86-SLOW-NEXT: # %bb.6: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: movl %esi, %ecx +; X86-SLOW-NEXT: .LBB6_7: +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: movl %edi, %ecx +; X86-SLOW-NEXT: shrl $31, %ecx +; X86-SLOW-NEXT: leal (%ecx,%eax,2), %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: jne .LBB6_9 +; X86-SLOW-NEXT: # %bb.8: +; X86-SLOW-NEXT: orl %edi, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; X86-SLOW-NEXT: .LBB6_9: +; X86-SLOW-NEXT: movb %bl, %dh +; X86-SLOW-NEXT: addb $-64, %dh +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movb %dh, %cl +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: testb $32, %dh +; X86-SLOW-NEXT: movl $0, %ecx +; X86-SLOW-NEXT: jne .LBB6_11 +; X86-SLOW-NEXT: # %bb.10: +; X86-SLOW-NEXT: movl %esi, %ecx +; X86-SLOW-NEXT: .LBB6_11: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: jb .LBB6_13 +; X86-SLOW-NEXT: # %bb.12: +; X86-SLOW-NEXT: xorl %ebp, %ebp +; X86-SLOW-NEXT: .LBB6_13: +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movb $64, %ch +; X86-SLOW-NEXT: movb $64, %ah +; X86-SLOW-NEXT: subb %dl, %ah +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movb %ah, %cl +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: testb $32, %ah +; X86-SLOW-NEXT: jne .LBB6_14 +; X86-SLOW-NEXT: # %bb.15: +; X86-SLOW-NEXT: orl %edi, %ebp +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebp, %esi +; X86-SLOW-NEXT: jmp .LBB6_16 +; X86-SLOW-NEXT: .LBB6_14: +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_16: +; X86-SLOW-NEXT: addb $-64, %dl +; X86-SLOW-NEXT: movb %dl, %cl +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: movb %dl, %cl +; X86-SLOW-NEXT: movl 
{{[0-9]+}}(%esp), %edi +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb $32, %dl +; X86-SLOW-NEXT: jne .LBB6_17 +; X86-SLOW-NEXT: # %bb.18: +; X86-SLOW-NEXT: orl %eax, %ebp +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jae .LBB6_20 +; X86-SLOW-NEXT: jmp .LBB6_21 +; X86-SLOW-NEXT: .LBB6_17: +; X86-SLOW-NEXT: movl %edi, %ebp +; X86-SLOW-NEXT: xorl %edi, %edi +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jb .LBB6_21 +; X86-SLOW-NEXT: .LBB6_20: +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SLOW-NEXT: .LBB6_21: +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: jae .LBB6_23 +; X86-SLOW-NEXT: # %bb.22: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: .LBB6_23: +; X86-SLOW-NEXT: cmpl $64, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jb .LBB6_24 +; X86-SLOW-NEXT: # %bb.25: +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_26 +; X86-SLOW-NEXT: .LBB6_24: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_26: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: shrl $31, %eax +; X86-SLOW-NEXT: leal (%eax,%esi,2), %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movb %bl, %cl +; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: shrl %edi +; X86-SLOW-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), 
%cl # 1-byte Reload +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: jne .LBB6_28 +; X86-SLOW-NEXT: # %bb.27: +; X86-SLOW-NEXT: orl %edi, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_28: +; X86-SLOW-NEXT: movl %ebp, %eax +; X86-SLOW-NEXT: movb %dh, %cl +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: testb $32, %dh +; X86-SLOW-NEXT: jne .LBB6_30 +; X86-SLOW-NEXT: # %bb.29: +; X86-SLOW-NEXT: orl %esi, %eax +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_30: +; X86-SLOW-NEXT: subb %bl, %ch +; X86-SLOW-NEXT: movl %ebp, %eax +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: addl %ebp, %ebp +; X86-SLOW-NEXT: notb %cl +; X86-SLOW-NEXT: shll %cl, %ebp +; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %ebp, %esi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-SLOW-NEXT: shrl %cl, %ebp +; X86-SLOW-NEXT: testb $32, %ch +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: jne .LBB6_31 +; X86-SLOW-NEXT: # %bb.32: +; X86-SLOW-NEXT: orl %ebp, %esi +; X86-SLOW-NEXT: movl %esi, %ebp +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jb .LBB6_34 +; X86-SLOW-NEXT: jmp .LBB6_35 +; X86-SLOW-NEXT: .LBB6_31: +; X86-SLOW-NEXT: movl %eax, %ebp +; X86-SLOW-NEXT: xorl %eax, %eax +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jae .LBB6_35 +; X86-SLOW-NEXT: .LBB6_34: +; X86-SLOW-NEXT: movl %ebp, %esi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-SLOW-NEXT: orl %eax, %ebp +; X86-SLOW-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, %ebp +; 
X86-SLOW-NEXT: .LBB6_35: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: cmpl $64, %ebx +; X86-SLOW-NEXT: jae .LBB6_37 +; X86-SLOW-NEXT: # %bb.36: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: orl %ebp, %eax +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: .LBB6_37: +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: testl %ebx, %ebx +; X86-SLOW-NEXT: je .LBB6_39 +; X86-SLOW-NEXT: # %bb.38: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: .LBB6_39: +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-SLOW-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-SLOW-NEXT: je .LBB6_41 +; X86-SLOW-NEXT: # %bb.40: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-SLOW-NEXT: .LBB6_41: +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: orl %ecx, %ebx +; X86-SLOW-NEXT: orl %ebp, %edx +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-SLOW-NEXT: movl %ebx, (%eax) +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %edx, 4(%eax) +; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: addl $72, %esp +; X86-SLOW-NEXT: popl %esi +; X86-SLOW-NEXT: popl %edi +; X86-SLOW-NEXT: popl %ebx +; X86-SLOW-NEXT: popl %ebp +; X86-SLOW-NEXT: retl $4 +; +; X64-FAST-LABEL: var_shift_i128: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: movq %r8, %r10 +; X64-FAST-NEXT: movq %rcx, %r9 +; X64-FAST-NEXT: movq %rdx, %r8 +; X64-FAST-NEXT: movq %rsi, %rdx +; X64-FAST-NEXT: movl %r10d, %ecx +; X64-FAST-NEXT: shrdq %cl, %r9, %r8 +; X64-FAST-NEXT: shrq %cl, %r9 +; X64-FAST-NEXT: xorl %eax, %eax +; X64-FAST-NEXT: 
testb $64, %r10b +; X64-FAST-NEXT: cmovneq %r9, %r8 +; X64-FAST-NEXT: cmovneq %rax, %r9 +; X64-FAST-NEXT: shldq $1, %rdi, %rdx +; X64-FAST-NEXT: addq %rdi, %rdi +; X64-FAST-NEXT: notb %r10b +; X64-FAST-NEXT: movl %r10d, %ecx +; X64-FAST-NEXT: shldq %cl, %rdi, %rdx +; X64-FAST-NEXT: shlq %cl, %rdi +; X64-FAST-NEXT: testb $64, %r10b +; X64-FAST-NEXT: cmovneq %rdi, %rdx +; X64-FAST-NEXT: cmoveq %rdi, %rax +; X64-FAST-NEXT: orq %r8, %rax +; X64-FAST-NEXT: orq %r9, %rdx +; X64-FAST-NEXT: retq +; +; X64-SLOW-LABEL: var_shift_i128: +; X64-SLOW: # %bb.0: +; X64-SLOW-NEXT: movq %rcx, %r10 +; X64-SLOW-NEXT: movq %rdx, %r9 +; X64-SLOW-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; X64-SLOW-NEXT: andq %rdi, %rax +; X64-SLOW-NEXT: movl %r8d, %ecx +; X64-SLOW-NEXT: shrq %cl, %rax +; X64-SLOW-NEXT: movq %rdi, %rcx +; X64-SLOW-NEXT: shrq $63, %rcx +; X64-SLOW-NEXT: leaq (%rcx,%rsi,2), %rdx +; X64-SLOW-NEXT: movl %r8d, %r11d +; X64-SLOW-NEXT: notb %r11b +; X64-SLOW-NEXT: movl %r11d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rdx +; X64-SLOW-NEXT: orq %rax, %rdx +; X64-SLOW-NEXT: movl %r8d, %ecx +; X64-SLOW-NEXT: shrq %cl, %r9 +; X64-SLOW-NEXT: leaq (%r10,%r10), %rsi +; X64-SLOW-NEXT: movl %r11d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rsi +; X64-SLOW-NEXT: orq %r9, %rsi +; X64-SLOW-NEXT: movl %r8d, %ecx +; X64-SLOW-NEXT: shrq %cl, %r10 +; X64-SLOW-NEXT: xorl %eax, %eax +; X64-SLOW-NEXT: testb $64, %r8b +; X64-SLOW-NEXT: cmovneq %r10, %rsi +; X64-SLOW-NEXT: cmovneq %rax, %r10 +; X64-SLOW-NEXT: addq %rdi, %rdi +; X64-SLOW-NEXT: movl %r11d, %ecx +; X64-SLOW-NEXT: shlq %cl, %rdi +; X64-SLOW-NEXT: testb $64, %r11b +; X64-SLOW-NEXT: cmovneq %rdi, %rdx +; X64-SLOW-NEXT: cmoveq %rdi, %rax +; X64-SLOW-NEXT: orq %rsi, %rax +; X64-SLOW-NEXT: orq %r10, %rdx +; X64-SLOW-NEXT: retq + %tmp = tail call i128 @llvm.fshr.i128(i128 %x, i128 %y, i128 %z) + ret i128 %tmp +} + ; ; Const Funnel Shift ; From 55927105dd7cd5d4c4da2a4e88e802a034ba54fd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: 
Mon, 16 Aug 2021 12:49:59 -0400 Subject: [PATCH 127/700] [NFC] Trim trailing whitespaces in `llvm/CMakeLists.txt` --- llvm/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index db6d119156717..34392f3308d0e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -803,8 +803,8 @@ if (TENSORFLOW_C_LIB_PATH) include_directories(${TENSORFLOW_C_LIB_PATH}/include) if (NOT TF_PROTO_HEADERS) message(STATUS "TF_PROTO_HEADERS not defined. Looking for tensorflow pip package.") - execute_process(COMMAND - ${Python3_EXECUTABLE} "-m" "pip" "show" "tensorflow" + execute_process(COMMAND + ${Python3_EXECUTABLE} "-m" "pip" "show" "tensorflow" OUTPUT_VARIABLE TF_PIP_OUT) if ("${TF_PIP_OUT}" STREQUAL "") message(FATAL ERROR "Tensorflow pip package is also required for 'development' mode (protobuf headers)") @@ -836,7 +836,7 @@ if (NOT TENSORFLOW_AOT_PATH STREQUAL "") include_directories(${TENSORFLOW_AOT_PATH}/include) add_subdirectory(${TENSORFLOW_AOT_PATH}/xla_aot_runtime_src ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/tf_runtime) - install(TARGETS tf_xla_runtime EXPORT LLVMExports + install(TARGETS tf_xla_runtime EXPORT LLVMExports ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT tf_xla_runtime) set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS tf_xla_runtime) endif() From 4357562067003e25ab343a2d67a60bd89cd66dbf Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Mon, 16 Aug 2021 10:04:32 -0700 Subject: [PATCH 128/700] [NFC][AArch64] Fix unused var in release build --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 5497956c4580d..9bbeb151d56bc 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10461,6 +10461,7 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, if 
(isTypeLegal(Op.getOperand(0).getValueType())) { unsigned NumOperands = Op->getNumOperands(); + (void)NumOperands; assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); From b7425e956be60a73004d7ae5bb37da85872c29fb Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Mon, 16 Aug 2021 10:15:30 -0700 Subject: [PATCH 129/700] [NFC] Fix typos s/senstive/sensitive/g --- .../StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp | 2 +- clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp | 2 +- compiler-rt/test/profile/Linux/instrprof-cs.c | 2 +- llvm/include/llvm/Transforms/Instrumentation.h | 4 ++-- llvm/lib/ProfileData/SampleProfReader.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp index 175dfcef0df45..a13de306eac84 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckObjCInstMethSignature, a flow-insenstive check +// This file defines a CheckObjCInstMethSignature, a flow-insensitive check // that determines if an Objective-C class interface incorrectly redefines // the method signature in a subclass.
// diff --git a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp index 90c5583d89691..dcca8be55e337 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckNSError, a flow-insenstive check +// This file defines a CheckNSError, a flow-insensitive check // that determines if an Objective-C class interface correctly returns // a non-void return type. // diff --git a/compiler-rt/test/profile/Linux/instrprof-cs.c b/compiler-rt/test/profile/Linux/instrprof-cs.c index d825525a532db..0ad6f0350c560 100644 --- a/compiler-rt/test/profile/Linux/instrprof-cs.c +++ b/compiler-rt/test/profile/Linux/instrprof-cs.c @@ -8,7 +8,7 @@ // RUN: %clang_profgen=%t.profraw -o %t.gen.cis -O2 %s // RUN: %run %t.gen.cis // RUN: llvm-profdata merge -o %t.cis.profdata %t.profraw -// Check context insenstive profile +// Check context insensitive profile // RUN: %clang_profuse=%t.cis.profdata -O2 -emit-llvm -S %s -o - | FileCheck %s --check-prefix=CIS int g1 = 1; int volatile g2 = 2; diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 03108bacb0da5..0c822999aecf3 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -78,7 +78,7 @@ struct GCOVOptions { ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()); -// PGO Instrumention. Parameter IsCS indicates if this is the context senstive +// PGO Instrumention. Parameter IsCS indicates if this is the context sensitive // instrumentation. ModulePass *createPGOInstrumentationGenLegacyPass(bool IsCS = false); ModulePass * @@ -138,7 +138,7 @@ struct InstrProfOptions { }; /// Insert frontend instrumentation based profiling. 
Parameter IsCS indicates if -// this is the context senstive instrumentation. +// this is the context sensitive instrumentation. ModulePass *createInstrProfilingLegacyPass( const InstrProfOptions &Options = InstrProfOptions(), bool IsCS = false); diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 6058eddb13dc7..a801ca1ef36d7 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -53,7 +53,7 @@ using namespace sampleprof; // For ext-binary format profiles, the flag is set in the summary. static cl::opt ProfileIsFSDisciminator( "profile-isfs", cl::Hidden, cl::init(false), - cl::desc("Profile uses flow senstive discriminators")); + cl::desc("Profile uses flow sensitive discriminators")); /// Dump the function profile for \p FName. /// From 5fc841d8a278ea16bae457deba35d0db6b716dd6 Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Sun, 15 Aug 2021 09:09:46 -0700 Subject: [PATCH 130/700] [scudo] Use stdint types for internal scudo types `scudo::uptr` was defined as an `unsigned long` on 32-b platform, while a `uintptr_t` is usually defined as an `unsigned int`. This worked, this was not consistent, particularly with regard to format string specifiers. As suggested by Vitaly, since we are including `stdint.h`, define the internal `scudo` integer types to those. 
Differential Revision: https://reviews.llvm.org/D108089 --- .../lib/scudo/standalone/internal_defs.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/internal_defs.h b/compiler-rt/lib/scudo/standalone/internal_defs.h index c9ffad136b78d..621fc9c45e952 100644 --- a/compiler-rt/lib/scudo/standalone/internal_defs.h +++ b/compiler-rt/lib/scudo/standalone/internal_defs.h @@ -78,16 +78,16 @@ namespace scudo { -typedef unsigned long uptr; -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; -typedef unsigned long long u64; -typedef signed long sptr; -typedef signed char s8; -typedef signed short s16; -typedef signed int s32; -typedef signed long long s64; +typedef uintptr_t uptr; +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef intptr_t sptr; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; // The following two functions have platform specific implementations. void outputRaw(const char *Buffer); From 8bc72dede68ccbbf828c0421276d962d369ba70f Mon Sep 17 00:00:00 2001 From: Alfsonso Gregory Date: Mon, 16 Aug 2021 19:07:50 +0100 Subject: [PATCH 131/700] [Clang][AST][NFC] Resolve FIXME: Remove unused QualType ElementType member from the ASTContext class. It is completely unused and not needed to be kept, so let us remove it. Differential Revision: https://reviews.llvm.org/D107719 --- clang/include/clang/AST/Type.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 4238667b8b076..fc83c895afa2e 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3455,10 +3455,6 @@ class ConstantMatrixType final : public MatrixType { protected: friend class ASTContext; - /// The element type of the matrix. - // FIXME: Appears to be unused? There is also MatrixType::ElementType... 
- QualType ElementType; - /// Number of rows and columns. unsigned NumRows; unsigned NumColumns; From 80ed75e7fb45f9f5fc84ca7cbe258be036015384 Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Mon, 16 Aug 2021 11:12:35 -0700 Subject: [PATCH 132/700] Revert "[NFC] Fix typos" This reverts commit b7425e956be60a73004d7ae5bb37da85872c29fb. --- .../StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp | 2 +- clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp | 2 +- compiler-rt/test/profile/Linux/instrprof-cs.c | 2 +- llvm/include/llvm/Transforms/Instrumentation.h | 4 ++-- llvm/lib/ProfileData/SampleProfReader.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp index a13de306eac84..175dfcef0df45 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckObjCInstMethSignature, a flow-insensitive check +// This file defines a CheckObjCInstMethSignature, a flow-insenstive check // that determines if an Objective-C class interface incorrectly redefines // the method signature in a subclass. // diff --git a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp index dcca8be55e337..90c5583d89691 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckNSError, a flow-insensitive check +// This file defines a CheckNSError, a flow-insenstive check // that determines if an Objective-C class interface correctly returns // a non-void return type. 
// diff --git a/compiler-rt/test/profile/Linux/instrprof-cs.c b/compiler-rt/test/profile/Linux/instrprof-cs.c index 0ad6f0350c560..d825525a532db 100644 --- a/compiler-rt/test/profile/Linux/instrprof-cs.c +++ b/compiler-rt/test/profile/Linux/instrprof-cs.c @@ -8,7 +8,7 @@ // RUN: %clang_profgen=%t.profraw -o %t.gen.cis -O2 %s // RUN: %run %t.gen.cis // RUN: llvm-profdata merge -o %t.cis.profdata %t.profraw -// Check context insensitive profile +// Check context insenstive profile // RUN: %clang_profuse=%t.cis.profdata -O2 -emit-llvm -S %s -o - | FileCheck %s --check-prefix=CIS int g1 = 1; int volatile g2 = 2; diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 0c822999aecf3..03108bacb0da5 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -78,7 +78,7 @@ struct GCOVOptions { ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()); -// PGO Instrumention. Parameter IsCS indicates if this is the context sensitive +// PGO Instrumention. Parameter IsCS indicates if this is the context senstive // instrumentation. ModulePass *createPGOInstrumentationGenLegacyPass(bool IsCS = false); ModulePass * @@ -138,7 +138,7 @@ struct InstrProfOptions { }; /// Insert frontend instrumentation based profiling. Parameter IsCS indicates if -// this is the context sensitive instrumentation. +// this is the context senstive instrumentation. ModulePass *createInstrProfilingLegacyPass( const InstrProfOptions &Options = InstrProfOptions(), bool IsCS = false); diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index a801ca1ef36d7..6058eddb13dc7 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -53,7 +53,7 @@ using namespace sampleprof; // For ext-binary format profiles, the flag is set in the summary. 
static cl::opt ProfileIsFSDisciminator( "profile-isfs", cl::Hidden, cl::init(false), - cl::desc("Profile uses flow sensitive discriminators")); + cl::desc("Profile uses flow senstive discriminators")); /// Dump the function profile for \p FName. /// From 427c9aa7c440aa9003e322a3107f8b222fa17ef4 Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Mon, 16 Aug 2021 11:13:48 -0700 Subject: [PATCH 133/700] Revert "[scudo] Use stdint types for internal scudo types" This reverts commit 5fc841d8a278ea16bae457deba35d0db6b716dd6. --- .../lib/scudo/standalone/internal_defs.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/internal_defs.h b/compiler-rt/lib/scudo/standalone/internal_defs.h index 621fc9c45e952..c9ffad136b78d 100644 --- a/compiler-rt/lib/scudo/standalone/internal_defs.h +++ b/compiler-rt/lib/scudo/standalone/internal_defs.h @@ -78,16 +78,16 @@ namespace scudo { -typedef uintptr_t uptr; -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; -typedef intptr_t sptr; -typedef int8_t s8; -typedef int16_t s16; -typedef int32_t s32; -typedef int64_t s64; +typedef unsigned long uptr; +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long long u64; +typedef signed long sptr; +typedef signed char s8; +typedef signed short s16; +typedef signed int s32; +typedef signed long long s64; // The following two functions have platform specific implementations. void outputRaw(const char *Buffer); From 0a031449b2c757400090b23bd6ddf4d896d32643 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 14 Aug 2021 16:21:56 +0200 Subject: [PATCH 134/700] [PassBuilder] Don't use MemorySSA for standalone LoopRotate passes Two standalone LoopRotate passes scheduled using createFunctionToLoopPassAdaptor() currently enable MemorySSA. 
However, while LoopRotate can preserve MemorySSA, it does not use it, so requiring MemorySSA is unnecessary. This change doesn't have a practical compile-time impact by itself, because subsequent passes still request MemorySSA. Differential Revision: https://reviews.llvm.org/D108073 --- clang/test/CodeGen/thinlto-distributed-newpm.ll | 2 +- llvm/lib/Passes/PassBuilder.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll index c61a6ff7fbeb5..4651a5d282faf 100644 --- a/clang/test/CodeGen/thinlto-distributed-newpm.ll +++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll @@ -134,7 +134,6 @@ ; CHECK-O: Running pass: LoopSimplifyPass on main ; CHECK-O: Running analysis: LoopAnalysis on main ; CHECK-O: Running pass: LCSSAPass on main -; CHECK-O: Running analysis: MemorySSAAnalysis on main ; CHECK-O: Running analysis: AAManager on main ; CHECK-O: Running analysis: BasicAA on main ; CHECK-O: Running analysis: ScalarEvolutionAnalysis on main @@ -147,6 +146,7 @@ ; CHECK-O: Running analysis: BranchProbabilityAnalysis on main ; CHECK-O: Running analysis: PostDominatorTreeAnalysis on main ; CHECK-O: Running analysis: DemandedBitsAnalysis on main +; CHECK-O: Running analysis: MemorySSAAnalysis on main ; CHECK-O: Running pass: LoopLoadEliminationPass on main ; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Running pass: SimplifyCFGPass on main diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 2972687274f52..cdf0a732e6708 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -923,7 +923,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, FunctionPassManager FPM; // Disable header duplication in loop rotation at -Oz. 
FPM.addPass(createFunctionToLoopPassAdaptor( - LoopRotatePass(Level != OptimizationLevel::Oz), EnableMSSALoopDependency, + LoopRotatePass(Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); @@ -1399,8 +1399,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // Disable header duplication at -Oz. OptimizePM.addPass(createFunctionToLoopPassAdaptor( LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink), - EnableMSSALoopDependency, - /*UseBlockFrequencyInfo=*/false)); + /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); // Distribute loops to allow partial vectorization. I.e. isolate dependences // into separate loop that would otherwise inhibit vectorization. This is From 570c9beb8ebb4bdcc807101518cc36ad5e20797c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 14 Aug 2021 17:55:08 +0200 Subject: [PATCH 135/700] [MemorySSA] Remove unnecessary MSSA dependencies LoopLoadElimination, LoopVersioning and LoopVectorize currently fetch MemorySSA when construction LoopAccessAnalysis. However, LoopAccessAnalysis does not actually use MemorySSA and we can pass nullptr instead. This saves one MemorySSA calculation in the default pipeline, and thus improves compile-time. 
Differential Revision: https://reviews.llvm.org/D108074 --- clang/test/CodeGen/thinlto-distributed-newpm.ll | 1 - llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 6 +----- llvm/lib/Transforms/Utils/LoopVersioning.cpp | 7 ++----- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +----- .../LoopVectorize/novect-lcssa-cfg-invalidation.ll | 1 - 5 files changed, 4 insertions(+), 17 deletions(-) diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll index 4651a5d282faf..8f7fc5e9b8411 100644 --- a/clang/test/CodeGen/thinlto-distributed-newpm.ll +++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll @@ -146,7 +146,6 @@ ; CHECK-O: Running analysis: BranchProbabilityAnalysis on main ; CHECK-O: Running analysis: PostDominatorTreeAnalysis on main ; CHECK-O: Running analysis: DemandedBitsAnalysis on main -; CHECK-O: Running analysis: MemorySSAAnalysis on main ; CHECK-O: Running pass: LoopLoadEliminationPass on main ; CHECK-O: Running pass: InstCombinePass on main ; CHECK-O: Running pass: SimplifyCFGPass on main diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index aaf586173e442..9c4f18f8e2213 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -34,7 +34,6 @@ #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -718,15 +717,12 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F, auto *PSI = MAMProxy.getCachedResult(*F.getParent()); auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult(F) : nullptr; - MemorySSA *MSSA = EnableMSSALoopDependency - ? 
&AM.getResult(F).getMSSA() - : nullptr; auto &LAM = AM.getResult(F).getManager(); bool Changed = eliminateLoadsAcrossLoops( F, LI, DT, BFI, PSI, &SE, &AC, [&](Loop &L) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + TLI, TTI, nullptr, nullptr}; return LAM.getResult(L, AR); }); diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 8a89158788cf8..14439796fb4ae 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -14,9 +14,9 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Dominators.h" @@ -354,14 +354,11 @@ PreservedAnalyses LoopVersioningPass::run(Function &F, auto &TLI = AM.getResult(F); auto &AA = AM.getResult(F); auto &AC = AM.getResult(F); - MemorySSA *MSSA = EnableMSSALoopDependency - ? 
&AM.getResult(F).getMSSA() - : nullptr; auto &LAM = AM.getResult(F).getManager(); auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + TLI, TTI, nullptr, nullptr}; return LAM.getResult(L, AR); }; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index aac382af50c2b..b842d15fe1874 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -87,7 +87,6 @@ #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -10539,15 +10538,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &AC = AM.getResult(F); auto &DB = AM.getResult(F); auto &ORE = AM.getResult(F); - MemorySSA *MSSA = EnableMSSALoopDependency - ? 
&AM.getResult(F).getMSSA() - : nullptr; auto &LAM = AM.getResult(F).getManager(); std::function GetLAA = [&](Loop &L) -> const LoopAccessInfo & { LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, - TLI, TTI, nullptr, MSSA}; + TLI, TTI, nullptr, nullptr}; return LAM.getResult(L, AR); }; auto &MAMProxy = AM.getResult(F); diff --git a/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll b/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll index 12e1b5e7ff2af..353fbe013f12f 100644 --- a/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/novect-lcssa-cfg-invalidation.ll @@ -13,7 +13,6 @@ define i32 @novect(i32* %p) { ; CHECK-NOT: Invalidating analysis: BranchProbabilityAnalysis on novect ; CHECK-NOT: Invalidating analysis: BlockFrequencyAnalysis on novect ; CHECK: Invalidating analysis: DemandedBitsAnalysis on novect -; CHECK: Invalidating analysis: MemorySSAAnalysis on novect ; CHECK: Running pass: JumpThreadingPass on novect ; CHECK: entry: From 19a906f372226e2ef491a355306afe6a2c35b354 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Mon, 16 Aug 2021 10:31:17 -0700 Subject: [PATCH 136/700] [mlir][sparse][python] make imports more selective Reviewed By: bixia Differential Revision: https://reviews.llvm.org/D108055 --- .../dialects/sparse_tensor/test_SpMM.py | 79 ++++++++++--------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/mlir/test/python/dialects/sparse_tensor/test_SpMM.py b/mlir/test/python/dialects/sparse_tensor/test_SpMM.py index 17ed92cb092ce..5b856bacd03a1 100644 --- a/mlir/test/python/dialects/sparse_tensor/test_SpMM.py +++ b/mlir/test/python/dialects/sparse_tensor/test_SpMM.py @@ -1,17 +1,19 @@ # RUN: SUPPORT_LIB=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext %PYTHON %s | FileCheck %s -import os import ctypes -import mlir.all_passes_registration import numpy as np +import os + +import mlir.all_passes_registration + +from mlir import ir +from 
mlir import runtime as rt +from mlir import execution_engine +from mlir import passmanager +from mlir.dialects import sparse_tensor as st from mlir.dialects import builtin -from mlir.dialects.linalg.opdsl.lang import * -from mlir.dialects.sparse_tensor import * -from mlir.execution_engine import * -from mlir.ir import * -from mlir.passmanager import * -from mlir.runtime import * +from mlir.dialects.linalg.opdsl import lang as dsl def run(f): @@ -20,28 +22,28 @@ def run(f): return f -@linalg_structured_op +@dsl.linalg_structured_op def matmul_dsl( - A=TensorDef(T, S.M, S.K), - B=TensorDef(T, S.K, S.N), - C=TensorDef(T, S.M, S.N, output=True)): - C[D.m, D.n] += A[D.m, D.k] * B[D.k, D.n] + A=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.K), + B=dsl.TensorDef(dsl.T, dsl.S.K, dsl.S.N), + C=dsl.TensorDef(dsl.T, dsl.S.M, dsl.S.N, output=True)): + C[dsl.D.m, dsl.D.n] += A[dsl.D.m, dsl.D.k] * B[dsl.D.k, dsl.D.n] -def build_SpMM(attr: EncodingAttr): +def build_SpMM(attr: st.EncodingAttr): """Build SpMM kernel. This method generates a linalg op with for matrix multiplication using just the Python API. Effectively, a generic linalg op is constructed that computes C(i,j) += A(i,k) * B(k,j) for annotated matrix A. """ - module = Module.create() + module = ir.Module.create() f64 = ir.F64Type.get() - a = RankedTensorType.get([3, 4], f64, attr) - b = RankedTensorType.get([4, 2], f64) - c = RankedTensorType.get([3, 2], f64) + a = ir.RankedTensorType.get([3, 4], f64, attr) + b = ir.RankedTensorType.get([4, 2], f64) + c = ir.RankedTensorType.get([3, 2], f64) arguments = [a, b, c] - with InsertionPoint(module.body): + with ir.InsertionPoint(module.body): @builtin.FuncOp.from_py_func(*arguments) def spMxM(*args): @@ -50,7 +52,7 @@ def spMxM(*args): return module -def boilerplate(attr: EncodingAttr): +def boilerplate(attr: st.EncodingAttr): """Returns boilerplate main method. 
This method sets up a boilerplate main method that calls the generated @@ -75,14 +77,15 @@ def boilerplate(attr: EncodingAttr): """ -def build_compile_and_run_SpMM(attr: EncodingAttr, support_lib: str, compiler): +def build_compile_and_run_SpMM(attr: st.EncodingAttr, support_lib: str, + compiler): # Build. module = build_SpMM(attr) func = str(module.operation.regions[0].blocks[0].operations[0].operation) - module = Module.parse(func + boilerplate(attr)) + module = ir.Module.parse(func + boilerplate(attr)) # Compile. compiler(module) - execution_engine = ExecutionEngine( + engine = execution_engine.ExecutionEngine( module, opt_level=0, shared_libs=[support_lib]) # Set up numpy input, invoke the kernel, and get numpy output. # Built-in bufferization uses in-out buffers. @@ -90,11 +93,11 @@ def build_compile_and_run_SpMM(attr: EncodingAttr, support_lib: str, compiler): Cin = np.zeros((3, 2), np.double) Cout = np.zeros((3, 2), np.double) Cin_memref_ptr = ctypes.pointer( - ctypes.pointer(get_ranked_memref_descriptor(Cin))) + ctypes.pointer(rt.get_ranked_memref_descriptor(Cin))) Cout_memref_ptr = ctypes.pointer( - ctypes.pointer(get_ranked_memref_descriptor(Cout))) - execution_engine.invoke('main', Cout_memref_ptr, Cin_memref_ptr) - Cresult = ranked_memref_to_numpy(Cout_memref_ptr[0]) + ctypes.pointer(rt.get_ranked_memref_descriptor(Cout))) + engine.invoke('main', Cout_memref_ptr, Cin_memref_ptr) + Cresult = rt.ranked_memref_to_numpy(Cout_memref_ptr[0]) # Sanity check on computed result. 
expected = [[12.3, 12.0], [0.0, 0.0], [16.5, 19.8]] @@ -121,8 +124,8 @@ def __init__(self, options: str): f'convert-std-to-llvm') self.pipeline = pipeline - def __call__(self, module: Module): - PassManager.parse(self.pipeline).run(module) + def __call__(self, module: ir.Module): + passmanager.PassManager.parse(self.pipeline).run(module) # CHECK-LABEL: TEST: testSpMM @@ -130,7 +133,7 @@ def __call__(self, module: Module): @run def testSpMM(): support_lib = os.getenv('SUPPORT_LIB') - with Context() as ctx, Location.unknown(): + with ir.Context() as ctx, ir.Location.unknown(): count = 0 # Fixed compiler optimization strategy. # TODO: explore state space here too @@ -144,20 +147,20 @@ def testSpMM(): # Exhaustive loop over various ways to annotate a kernel with # a *single* sparse tensor. Even this subset already gives # quite a large state space! - levels = [[DimLevelType.dense, DimLevelType.dense], - [DimLevelType.dense, DimLevelType.compressed], - [DimLevelType.compressed, DimLevelType.dense], - [DimLevelType.compressed, DimLevelType.compressed]] + levels = [[st.DimLevelType.dense, st.DimLevelType.dense], + [st.DimLevelType.dense, st.DimLevelType.compressed], + [st.DimLevelType.compressed, st.DimLevelType.dense], + [st.DimLevelType.compressed, st.DimLevelType.compressed]] orderings = [ - AffineMap.get_permutation([0, 1]), - AffineMap.get_permutation([1, 0]) + ir.AffineMap.get_permutation([0, 1]), + ir.AffineMap.get_permutation([1, 0]) ] bitwidths = [0, 8, 32] - for levels in levels: + for level in levels: for ordering in orderings: for pwidth in bitwidths: for iwidth in bitwidths: - attr = EncodingAttr.get(levels, ordering, pwidth, iwidth) + attr = st.EncodingAttr.get(level, ordering, pwidth, iwidth) compiler = SparseCompiler(options=opt) build_compile_and_run_SpMM(attr, support_lib, compiler) count = count + 1 From 65532ea6dd52a082d2033b0958088d17034a67de Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Mon, 16 Aug 2021 11:46:58 -0700 Subject: [PATCH 137/700] 
[mlir][linalg] Clear unused linalg tc operations These operations are not lowered to from any source dialect and are only used for redundant tests. Removing these named ops, along with their associated tests, will make migration to YAML operations much more convenient. Reviewed By: stellaraccident Differential Revision: https://reviews.llvm.org/D107993 --- .../Linalg/IR/LinalgNamedStructuredOps.yaml | 81 -------- .../Linalg/IR/LinalgNamedStructuredOpsSpec.tc | 181 ----------------- .../Linalg/Transforms/Vectorization.cpp | 18 -- .../linalg/opdsl/ops/core_named_ops.py | 18 -- .../Dialect/Linalg/generalize-named-ops.mlir | 134 ------------ .../generalize-named-polymorphic-ops.mlir | 30 --- mlir/test/Dialect/Linalg/named-ops.mlir | 114 ----------- ...mark_matmul_column_major_as_row_major.mlir | 136 ------------- ...est-conv-1d-input-ncw-filter-wcf-call.mlir | 70 ------- .../Linalg/CPU/test-conv-1d-ncw-call.mlir | 68 ------- .../Linalg/CPU/test-conv-1d-nwc-call.mlir | 79 -------- ...t-conv-2d-input-nchw-filter-hwcf-call.mlir | 83 -------- .../Linalg/CPU/test-conv-2d-nchw-call.mlir | 83 -------- .../Linalg/CPU/test-conv-2d-nhwc-call.mlir | 127 ------------ ...conv-3d-input-ncdhw-filter-dhwcf-call.mlir | 90 --------- .../Linalg/CPU/test-conv-3d-ncdhw-call.mlir | 88 -------- .../Linalg/CPU/test-conv-3d-ndhwc-call.mlir | 190 ------------------ .../integration/dialects/linalg/opsrun.py | 65 ------ 18 files changed, 1655 deletions(-) delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir delete mode 100644 
mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index feb8e0c1ea5a4..b5199d1e40ad3 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -791,87 +791,6 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: K --- !LinalgOpConfig -metadata: !LinalgOpMetadata - name: depthwise_conv_2d_input_nhwc_filter_hwc_poly - cpp_class_name: DepthwiseConv2DInputNhwcFilterHwcPolyOp - doc: |- - Performs depth-wise 2-D convolution. - - Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. 
-structured_op: !LinalgStructuredOpConfig - args: - - !LinalgOperandDefConfig - name: I - usage: InputOperand - type_var: T1 - shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] -> - (s0, s1, s2, s3)> - - !LinalgOperandDefConfig - name: K - usage: InputOperand - type_var: T2 - shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] -> - (s4, s5, s3)> - - !LinalgOperandDefConfig - name: O - usage: OutputOperand - type_var: U - shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] -> - (s0, s6, s7, s3)> - - !LinalgOperandDefConfig - name: strides - usage: IndexAttribute - type_var: I64 - attribute_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] - -> (s8, s9)> - - !LinalgOperandDefConfig - name: dilations - usage: IndexAttribute - type_var: I64 - attribute_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] - -> (s10, s11)> - indexing_maps: !LinalgIndexingMapsConfig - static_indexing_maps: - - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, - s10, s11] -> (d0, d1 * s8 + d3 * s10, d2 * s9 + d4 * s11, d5)> - - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, - s10, s11] -> (d3, d4, d5)> - - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, - s10, s11] -> (d0, d1, d2, d5)> - iterator_types: - - parallel - - parallel - - parallel - - reduction - - reduction - - parallel - assignments: - - !ScalarAssign - arg: O - value: !ScalarExpression - scalar_apply: - fn_name: add - operands: - - !ScalarExpression - scalar_arg: O - - !ScalarExpression - scalar_apply: - fn_name: mul - operands: - - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: I - - !ScalarExpression - symbolic_cast: - type_var: U - operands: - - !ScalarExpression - scalar_arg: K ---- !LinalgOpConfig metadata: !LinalgOpMetadata name: conv_2d_nhwc_hwcf_q cpp_class_name: Conv2DNhwcHwcfQOp 
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc index e792c110eab61..7f0f7deab3e74 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOpsSpec.tc @@ -1,9 +1,3 @@ -ods_def -implements_interface : -def matmul_column_major(A: f32(K, M), B: f32(N, K)) -> (C: f32(N, M)) { - C(n, m) = AddFOp(C(n, m), MulFOp(A(k, m), B(n, k))); -} - ods_def implements_interface : def matmul_i8_i8_i32(A: i8(M, K), B: i8(K, N)) -> (C: i32(M, N)) { @@ -12,139 +6,22 @@ def matmul_i8_i8_i32(A: i8(M, K), B: i8(K, N)) -> (C: i32(M, N)) { C(m, n) = AddIOp(C(m, n), MulIOp(SignExtendIOp32(A(m, k)), SignExtendIOp32(B(k, n)))); } -ods_def -implements_interface : -def matmul_i16_i16_i32(A: i16(M, K), B: i16(K, N)) -> (C: i32(M, N)) { - C(m, n) = AddIOp(C(m, n), MulIOp(SignExtendIOp32(A(m, k)), SignExtendIOp32(B(k, n)))); -} - -ods_def -implements_interface : -def matmul_i32_i32_i32(A: i32(M, K), B: i32(K, N)) -> (C: i32(M, N)) { - C(m, n) = AddIOp(C(m, n), MulIOp(A(m, k), B(k, n))); -} - -ods_def -implements_interface : -def matvec_i8_i8_i32(A: i8(M, N), y: i8(N)) -> (x: i32(M)) { - x(m) = AddIOp(x(m), MulIOp(SignExtendIOp32(A(m, n)), SignExtendIOp32(y(n)))); -} - -ods_def -implements_interface : -def matvec_i16_i16_i32(A: i16(M, N), y: i16(N)) -> (x: i32(M)) { - x(m) = AddIOp(x(m), MulIOp(SignExtendIOp32(A(m, n)), SignExtendIOp32(y(n)))); -} - -ods_def -implements_interface : -def matvec_i32_i32_i32(A: i32(M, N), y: i32(N)) -> (x: i32(M)) { - x(m) = AddIOp(x(m), MulIOp(A(m, n), y(n))); -} - -ods_def -implements_interface : -def vecmat_i8_i8_i32(y: i8(M), A: i8(M, N)) -> (x: i32(N)) { - x(n) = AddIOp(x(n), MulIOp(SignExtendIOp32(y(m)), SignExtendIOp32(A(m, n)))); -} - -ods_def -implements_interface : -def vecmat_i16_i16_i32(y: i16(M), A: i16(M, N)) -> (x: i32(N)) { - x(n) = AddIOp(x(n), 
MulIOp(SignExtendIOp32(y(m)), SignExtendIOp32(A(m, n)))); -} - -ods_def -implements_interface : -def vecmat_i32_i32_i32(y: i32(M), A: i32(M, N)) -> (x: i32(N)) { - x(n) = AddIOp(x(n), MulIOp(y(m), A(m, n))); -} - -ods_def -implements_interface : -def dot_i8_i8_i32(A: i8(M), B: i8(M)) -> (C: i32()) { - C() = AddIOp(C(), MulIOp(SignExtendIOp32(A(m)), SignExtendIOp32(B(m)))); -} - -ods_def -implements_interface : -def dot_i16_i16_i32(A: i16(M), B: i16(M)) -> (C: i32()) { - C() = AddIOp(C(), MulIOp(SignExtendIOp32(A(m)), SignExtendIOp32(B(m)))); -} - -ods_def -implements_interface : -def dot_i32_i32_i32(A: i32(M), B: i32(M)) -> (C: i32()) { - C() = AddIOp(C(), MulIOp(A(m), B(m))); -} - -ods_def -implements_interface : -def batch_matmul_i8_i8_i32(A: i8(Batch, M, K), B: i8(Batch, K, N)) -> (C: i32(Batch, M, N)) { - C(b, m, n) = - AddIOp(C(b, m, n), MulIOp(SignExtendIOp32(A(b, m, k)), SignExtendIOp32(B(b, k, n)))); -} - -ods_def -implements_interface : -def batch_matmul_i16_i16_i32(A: i16(Batch, M, K), B: i16(Batch, K, N)) -> (C: i32(Batch, M, N)) { - C(b, m, n) = - AddIOp(C(b, m, n), MulIOp(SignExtendIOp32(A(b, m, k)), SignExtendIOp32(B(b, k, n)))); -} - - -ods_def -implements_interface : -def batch_matmul_i32_i32_i32(A: i32(Batch, M, K), B: i32(Batch, K, N)) -> (C: i32(Batch, M, N)) { - C(b, m, n) = AddIOp(C(b, m, n), MulIOp(A(b, m, k), B(b, k, n))); -} - ods_def: def conv_1d(I: f32(W), K: f32(KW)) -> (O: f32(W)) { O(w) = AddFOp(O(w), MulFOp(I(w + kw), K(kw))); } -ods_def: -def conv_1d_nwc(I: f32(N, W, C), K: f32(F, KW, C)) -> (O: f32(N, W, F)) { - O(n, w, f) = AddFOp(O(n, w, f), MulFOp(I(n, w + kw, c), K(f, kw, c))); -} - -ods_def: -def conv_1d_ncw(I: f32(N, C, W), K: f32(F, C, KW)) -> (O: f32(N, F, W)) { - O(n, f, w) = AddFOp(O(n, f, w), MulFOp(I(n, c, w + kw), K(f, c, kw))); -} - ods_def: def conv_2d(I: f32(H, W), K: f32(KH, KW)) -> (O: f32(H, W)) { O(h, w) = AddFOp(O(h, w), MulFOp(I(h + kh, w + kw), K(kh, kw))); } -ods_def: -def conv_2d_nhwc(I: f32(N, H, W, C), K: 
f32(F, KH, KW, C)) -> (O: f32(N, H, W, F)) { - O(n, h, w, f) = AddFOp( - O(n, h, w, f), MulFOp(I(n, h + kh, w + kw, c), K(f, kh, kw, c))); -} - ods_def: def conv_3d(I: f32(D, H, W), K: f32(KD, KH, KW)) -> (O: f32(D, H, W)) { O(d, h, w) = AddFOp( O(d, h, w), MulFOp(I(d + kd, h + kh, w + kw), K(kd, kh, kw))); } -ods_def: -def conv_3d_ndhwc(I: f32(N, D, H, W, C), K: f32(F, KD, KH, KW, C)) -> (O: f32(N, D, H, W, F)) { - O(n, d, h, w, f) = AddFOp( - O(n, d, h, w, f), - MulFOp(I(n, d + kd, h + kh, w + kw, c), K(f, kd, kh, kw, c))); -} - -ods_def: -def conv_3d_ncdhw(I: f32(N, C, D, H, W), K: f32(F, C, KD, KH, KW)) -> (O: f32(N, F, D, H, W)) { - O(n, f, d, h, w) = AddFOp( - O(n, f, d, h, w), - MulFOp(I(n, c, d + kd, h + kh, w + kw), K(f, c, kd, kh, kw))); -} - ods_def: def depthwise_conv_2d_input_nhwc_filter_hwcf (I: f32(N, IH, IW, CI), K: f32(KH, KW, CI, CO)) @@ -238,23 +115,6 @@ order of (`N`, `W`, `F`, `KW`, `C`). MulFOp(I(n, w * strides[0] + kw * dilations[0], c), K(kw, c, f))); } -ods_def: -def conv_1d_input_ncw_filter_wcf(I: f32(N, C, W), K: f32(KW, C, F)) -> (O: f32(N, F, W)) - attr(strides: 1xi64, dilations: 1xi64) -""" A 1-D convolution given NCW layout input and WCF layout filter. - -Computes a 1-D convolution given 3-D input and filter. The data layout -of input is NCW and the data layout of filter is WCF. - -The indexing maps for these three tensors contain 5 dimensions, following the -order of (`N`, `F`, `W`, `KW`, `C`). -""" -{ - O(n, f, w) = AddFOp( - O(n, f, w), - MulFOp(I(n, c, w * strides[0] + kw * dilations[0]), K(kw, c, f))); -} - ods_def: def conv_2d_input_nhwc_filter_hwcf(I: f32(N, H, W, C), K: f32(KH, KW, C, F)) -> (O: f32(N, H, W, F)) attr(strides: 2xi64, dilations: 2xi64) @@ -273,26 +133,6 @@ order of (`N`, `H`, `W`, `F`, `KH`, `KW`, `C`). 
K(kh, kw, c, f))); } -ods_def: -def conv_2d_input_nchw_filter_hwcf - (I: f32(N, C, H, W), K: f32(KH, KW, C, F)) - -> (O: f32(N, F, H, W)) - attr(strides: 2xi64, dilations: 2xi64) -""" A 2-D convolution given NCHW layout input and HWCF layout filter. - -Computes a 2-D convolution given 4-D input and filter. The data layout -of input is NCHW and the data layout of filter is HWCF. - -The indexing maps for these three tensors contain 7 dimensions, following the -order of (`N`, `F`, `H`, `W`, `KH`, `KW`, `C`). -""" -{ - O(n, f, h, w) = AddFOp( - O(n, f, h, w), MulFOp(I(n, c, h * strides[0] + kh * dilations[0], - w * strides[1] + kw * dilations[1]), - K(kh, kw, c, f))); -} - ods_def: def conv_3d_input_ndhwc_filter_dhwcf (I: f32(N, D, H, W, C), K: f32(KD, KH, KW, C, F)) @@ -313,24 +153,3 @@ order of (`N`, `D`, `H`, `W`, `F`, `KD`, `KH`, `KW`, `C`). w * strides[2] + kw * dilations[2], c), K(kd, kh, kw, c, f))); } - -ods_def: -def conv_3d_input_ncdhw_filter_dhwcf - (I: f32(N, C, D, H, W), K: f32(KD, KH, KW, C, F)) - -> (O: f32(N, F, D, H, W)) - attr(strides: 3xi64, dilations: 3xi64) -""" A 3-D convolution given NCDHW layout input and DHWCF layout filter. - -Computes a 3-D convolution given 5-D input and filter. The data layout -of input is NCDHW and the data layout of filter is DHWCF. - -The indexing maps for these three tensors contain 9 dimensions, following the -order of (`N`, `F`, `D`, `H`, `W`, `KD`, `KH`, `KW`, `C`). 
-""" -{ - O(n, f, d, h, w) = AddFOp( - O(n, f, d, h, w), MulFOp(I(n, c, d * strides[0] + kd * dilations[0], - h * strides[1] + kh * dilations[1], - w * strides[2] + kw * dilations[2]), - K(kd, kh, kw, c, f))); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 9324fa2f443e1..abb4328b08f10 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1168,42 +1168,24 @@ void mlir::linalg::populateConvVectorizationPatterns( populateVectorizationPatterns(tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns(tiling, promotion, vectorization, - tileSizes); populateVectorizationPatterns( tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns(tiling, promotion, vectorization, - tileSizes); - populateVectorizationPatterns( - tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns(tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns(tiling, promotion, vectorization, - tileSizes); populateVectorizationPatterns( tiling, promotion, vectorization, tileSizes); populateVectorizationPatterns(tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns( - tiling, promotion, vectorization, tileSizes); populateVectorizationPatterns(tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns(tiling, promotion, - vectorization, tileSizes); populateVectorizationPatterns( tiling, promotion, vectorization, tileSizes); - populateVectorizationPatterns(tiling, promotion, - vectorization, tileSizes); - populateVectorizationPatterns( - tiling, promotion, vectorization, tileSizes); - patterns.push_back(std::move(tiling)); patterns.push_back(std::move(promotion)); patterns.push_back(std::move(vectorization)); diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py 
b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index b2bd4ce57f9b5..21ca35bf1036f 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -177,24 +177,6 @@ def conv_2d_nhwc_hwcf( O[D.n, D.oh, D.ow, D.f] += cast( U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.c ]) * cast(U, K[D.kh, D.kw, D.c, D.f]) - -@linalg_structured_op -def depthwise_conv_2d_input_nhwc_filter_hwc_poly( - I=TensorDef(T1, S.N, S.IH, S.IW, S.C), - K=TensorDef(T2, S.KH, S.KW, S.C), - O=TensorDef(U, S.N, S.OH, S.OW, S.C, output=True), - strides=AttributeDef(S.SH, S.SW), - dilations=AttributeDef(S.DH, S.DW)): - """Performs depth-wise 2-D convolution. - - Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. - """ - domain(D.n, D.oh, D.ow, D.kh, D.kw, D.c) - O[D.n, D.oh, D.ow, D.c] += cast( - U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, - D.c]) * cast(U, K[D.kh, D.kw, D.c]) - @linalg_structured_op def conv_2d_nhwc_hwcf_q( I=TensorDef(T1, S.N, S.IH, S.IW, S.C), diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index 31966890dcf09..5260efd28d553 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -182,140 +182,6 @@ func @conv_1d_input_nwc_filter_wcf(%input: memref, %filter: memref, %filter: memref, %output: memref) { - linalg.conv_1d_input_ncw_filter_wcf {dilations = dense<1> : tensor<1xi64>, - strides = dense<1> : tensor<1xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2 + d3)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4, d1)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> - -// 
CHECK: func @conv_1d_input_ncw_filter_wcf -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel"]} -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) -// CHECK-SAME: outs(%{{.+}} : memref) - -// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32) -// CHECK-NEXT: %[[MUL:.+]] = mulf %[[BBARG0]], %[[BBARG1]] : f32 -// CHECK-NEXT: %[[ADD:.+]] = addf %[[BBARG2]], %[[MUL]] : f32 -// CHECK-NEXT: linalg.yield %[[ADD]] : f32 - -// ----- - -func @conv_2d_input_nhwc_filter_hwcf(%input: memref, %filter: memref, %output: memref) { - linalg.conv_2d_input_nhwc_filter_hwcf {dilations = dense<2> : tensor<2xi64>, - strides = dense<3> : tensor<2xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1 * 3 + d4 * 2, d2 * 3 + d5 * 2, d6)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d3)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> - -// CHECK: func @conv_2d_input_nhwc_filter_hwcf - -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"]} -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) -// CHECK-SAME: outs(%{{.+}} : memref) - -// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32) -// CHECK-NEXT: %[[MUL:.+]] = mulf %[[BBARG0]], %[[BBARG1]] : f32 -// CHECK-NEXT: %[[ADD:.+]] = addf %[[BBARG2]], %[[MUL]] : f32 -// CHECK-NEXT: linalg.yield %[[ADD]] : f32 - -// ----- - -func @conv_2d_input_nchw_filter_hwcf(%input: memref, %filter: memref, %output: memref) { - linalg.conv_2d_input_nchw_filter_hwcf {dilations = dense<1> : tensor<2xi64>, - strides = dense<1> 
: tensor<2xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d6, d2 + d4, d3 + d5)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d1)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)> - -// CHECK: func @conv_2d_input_nchw_filter_hwcf - -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"]} -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) -// CHECK-SAME: outs(%{{.+}} : memref) - -// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32) -// CHECK-NEXT: %[[MUL:.+]] = mulf %[[BBARG0]], %[[BBARG1]] : f32 -// CHECK-NEXT: %[[ADD:.+]] = addf %[[BBARG2]], %[[MUL]] : f32 -// CHECK-NEXT: linalg.yield %[[ADD]] : f32 - -// ----- - -func @conv_3d_input_ndhwc_filter_dhwcf(%input: memref, %filter: memref, %output: memref) { - linalg.conv_3d_input_ndhwc_filter_dhwcf {dilations = dense<1> : tensor<3xi64>, - strides = dense<1> : tensor<3xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1 + d5, d2 + d6, d3 + d7, d8)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d5, d6, d7, d8, d4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d2, d3, d4)> - -// CHECK: func @conv_3d_input_ndhwc_filter_dhwcf - -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "parallel"]} -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) -// CHECK-SAME: outs(%{{.+}} 
: memref) - -// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32) -// CHECK-NEXT: %[[MUL:.+]] = mulf %[[BBARG0]], %[[BBARG1]] : f32 -// CHECK-NEXT: %[[ADD:.+]] = addf %[[BBARG2]], %[[MUL]] : f32 -// CHECK-NEXT: linalg.yield %[[ADD]] : f32 - -// ----- - -func @conv_3d_input_ncdhw_filter_dhwcf(%input: memref, %filter: memref, %output: memref) { - linalg.conv_3d_input_ncdhw_filter_dhwcf {dilations = dense<1> : tensor<3xi64>, - strides = dense<1> : tensor<3xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d8, d2 + d5, d3 + d6, d4 + d7)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d5, d6, d7, d8, d1)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d2, d3, d4)> - -// CHECK: func @conv_3d_input_ncdhw_filter_dhwcf - -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "parallel"]} -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) -// CHECK-SAME: outs(%{{.+}} : memref) - -// CHECK: ^{{.+}}(%[[BBARG0:.+]]: f32, %[[BBARG1:.+]]: f32, %[[BBARG2:.+]]: f32) -// CHECK-NEXT: %[[MUL:.+]] = mulf %[[BBARG0]], %[[BBARG1]] : f32 -// CHECK-NEXT: %[[ADD:.+]] = addf %[[BBARG2]], %[[MUL]] : f32 -// CHECK-NEXT: linalg.yield %[[ADD]] : f32 - -// ----- - func @generalize_fill(%output: memref, %value : f32) { linalg.fill(%value, %output) : f32, memref return diff --git a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir index 2d9f0932cde90..3e934d42012c4 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir @@ -30,36 +30,6 @@ 
func @generalize_matmul_tensor_i32(%A : tensor<16x8xi32>, %B: tensor<8x32xi32>, // ----- -func @generalize_depthwise_conv_2d_input_nhwc_filter_hwc_poly_f32(%input : tensor<1x4x16x1xf32>, %filter: tensor<2x2x1xf32>, %output: tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> { - %0 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc_poly {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>} - ins(%input, %filter : tensor<1x4x16x1xf32>, tensor<2x2x1xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> - return %0: tensor<1x2x4x1xf32> -} - -// CHECK-LABEL: @generalize_depthwise_conv_2d_input_nhwc_filter_hwc_poly_f32 -// CHECK: ^{{.*}}(%[[IN_ARG:.+]]: f32, %[[FILTER_ARG:.+]]: f32, %[[OUT_ARG:.+]]: f32) -// CHECK-NEXT: %[[MUL:.+]] = mulf %[[IN_ARG]], %[[FILTER_ARG]] : f32 -// CHECK-NEXT: %[[ADD:.+]] = addf %[[OUT_ARG]], %[[MUL]] : f32 -// CHECK-NEXT: linalg.yield %[[ADD]] : f32 -// CHECK-NEXT: -> tensor<1x2x4x1xf32> - -// ----- - -func @generalize_depthwise_conv_2d_input_nhwc_filter_hwc_poly_i32(%input : tensor<1x4x16x1xi32>, %filter: tensor<2x2x1xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> { - %0 = linalg.depthwise_conv_2d_input_nhwc_filter_hwc_poly {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>} - ins(%input, %filter : tensor<1x4x16x1xi32>, tensor<2x2x1xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> - return %0: tensor<1x2x4x1xi32> -} - -// CHECK-LABEL: @generalize_depthwise_conv_2d_input_nhwc_filter_hwc_poly_i32 -// CHECK: ^{{.*}}(%[[IN_ARG:.+]]: i32, %[[FILTER_ARG:.+]]: i32, %[[OUT_ARG:.+]]: i32) -// CHECK-NEXT: %[[MUL:.+]] = muli %[[IN_ARG]], %[[FILTER_ARG]] : i32 -// CHECK-NEXT: %[[ADD:.+]] = addi %[[OUT_ARG]], %[[MUL]] : i32 -// CHECK-NEXT: linalg.yield %[[ADD]] : i32 -// CHECK-NEXT: -> tensor<1x2x4x1xi32> - -// ----- - func @generalize_pooling_nhwc_max_f32(%input : tensor<1x4x16x1xf32>, %shape: tensor<2x2xf32>, %output: tensor<1x2x4x1xf32>) -> 
tensor<1x2x4x1xf32> { %0 = linalg.pooling_nhwc_max {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>} ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index d19b87c487f90..2bcac817d7620 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -16,24 +16,6 @@ func @depthwise_conv_2d_input_nhwc_filter_hwcf_tensor(%input: tensor<2x4x5x2xf32 return %0 : tensor<2x3x4x2x3xf32> } -// CHECK-LABEL: func @conv_2d_nchw_tensor -func @conv_2d_nchw_tensor(%input: tensor<2x2x4x5xf32>, %filter: tensor<4x2x3x3xf32>) -> tensor<2x4x2x3xf32> { - %cst = constant 0.000000e+00 : f32 - %init = linalg.init_tensor [2, 4, 2, 3] : tensor<2x4x2x3xf32> - %fill = linalg.fill(%cst, %init) : f32, tensor<2x4x2x3xf32> -> tensor<2x4x2x3xf32> -// CHECK: %{{.+}} = linalg.conv_2d_nchw -// CHECK-SAME: {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x2x4x5xf32>, tensor<4x2x3x3xf32>) -// CHECK-SAME: outs(%{{.+}} : tensor<2x4x2x3xf32>) -> tensor<2x4x2x3xf32> -// CHECK: return %{{.+}} : tensor<2x4x2x3xf32> -// CHECK: } - %0 = linalg.conv_2d_nchw - {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} - ins(%input, %filter: tensor<2x2x4x5xf32>, tensor<4x2x3x3xf32>) - outs(%fill : tensor<2x4x2x3xf32>) -> tensor<2x4x2x3xf32> - return %0 : tensor<2x4x2x3xf32> -} - // CHECK-LABEL: func @depthwise_conv_2d_input_nhwc_filter_hwcf_memref func @depthwise_conv_2d_input_nhwc_filter_hwcf_memref(%input: memref<2x4x5x2xf32>, %filter: memref<2x2x2x3xf32>, %output: memref<2x3x4x2x3xf32>) { // CHECK: linalg.depthwise_conv_2d_input_nhwc_filter_hwcf @@ -174,38 +156,6 @@ func @conv_1d_input_nwc_filter_wcf(%input: memref, %filter: memref, %filter: tensor, %init: tensor) -> tensor { - // CHECK: 
%{{.+}} = linalg.conv_1d_input_ncw_filter_wcf - // CHECK-SAME: dilations = dense<1> : tensor<1xi64> - // CHECK-SAME: strides = dense<1> : tensor<1xi64> - // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor) - // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor - %0 = linalg.conv_1d_input_ncw_filter_wcf {dilations = dense<1> : tensor<1xi64>, - strides = dense<1> : tensor<1xi64>} - ins (%input, %filter: tensor, tensor) - outs (%init: tensor) -> tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @conv_1d_input_ncw_filter_wcf -func @conv_1d_input_ncw_filter_wcf(%input: memref, %filter: memref, %output: memref) { - // CHECK: linalg.conv_1d_input_ncw_filter_wcf - // CHECK-SAME: dilations = dense<1> : tensor<1xi64> - // CHECK-SAME: strides = dense<1> : tensor<1xi64> - // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) - // CHECK-SAME: outs(%{{.+}} : memref) - linalg.conv_1d_input_ncw_filter_wcf {dilations = dense<1> : tensor<1xi64>, - strides = dense<1> : tensor<1xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// ----- - // CHECK-LABEL: func @conv_2d_input_nhwc_filter_hwcf func @conv_2d_input_nhwc_filter_hwcf(%input: tensor, %filter: tensor, %init: tensor) -> tensor { // CHECK: %{{.+}} = linalg.conv_2d_input_nhwc_filter_hwcf @@ -238,38 +188,6 @@ func @conv_2d_input_nhwc_filter_hwcf(%input: memref, %filter: memre // ----- -// CHECK-LABEL: func @conv_2d_input_nchw_filter_hwcf -func @conv_2d_input_nchw_filter_hwcf(%input: tensor, %filter: tensor, %init: tensor) -> tensor { - // CHECK: %{{.+}} = linalg.conv_2d_input_nchw_filter_hwcf - // CHECK-SAME: dilations = dense<1> : tensor<2xi64> - // CHECK-SAME: strides = dense<1> : tensor<2xi64> - // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor) - // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor - %0 = linalg.conv_2d_input_nchw_filter_hwcf {dilations = dense<1> : tensor<2xi64>, - strides = dense<1> : tensor<2xi64>} - ins (%input, %filter: tensor, tensor) - outs (%init: tensor) 
-> tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @conv_2d_input_nchw_filter_hwcf -func @conv_2d_input_nchw_filter_hwcf(%input: memref, %filter: memref, %output: memref) { - // CHECK: linalg.conv_2d_input_nchw_filter_hwcf - // CHECK-SAME: dilations = dense<1> : tensor<2xi64> - // CHECK-SAME: strides = dense<1> : tensor<2xi64> - // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) - // CHECK-SAME: outs(%{{.+}} : memref) - linalg.conv_2d_input_nchw_filter_hwcf {dilations = dense<1> : tensor<2xi64>, - strides = dense<1> : tensor<2xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// ----- - // CHECK-LABEL: func @conv_3d_input_ndhwc_filter_dhwcf func @conv_3d_input_ndhwc_filter_dhwcf(%input: tensor, %filter: tensor, %init: tensor) -> tensor { // CHECK: %{{.+}} = linalg.conv_3d_input_ndhwc_filter_dhwcf @@ -302,38 +220,6 @@ func @conv_3d_input_ndhwc_filter_dhwcf(%input: memref, %filter: m // ----- -// CHECK-LABEL: func @conv_3d_input_ncdhw_filter_dhwcf -func @conv_3d_input_ncdhw_filter_dhwcf(%input: tensor, %filter: tensor, %init: tensor) -> tensor { - // CHECK: %{{.+}} = linalg.conv_3d_input_ncdhw_filter_dhwcf - // CHECK-SAME: dilations = dense<1> : tensor<3xi64> - // CHECK-SAME: strides = dense<1> : tensor<3xi64> - // CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor) - // CHECK-SAME: outs(%{{.+}} : tensor) -> tensor - %0 = linalg.conv_3d_input_ncdhw_filter_dhwcf {dilations = dense<1> : tensor<3xi64>, - strides = dense<1> : tensor<3xi64>} - ins (%input, %filter: tensor, tensor) - outs (%init: tensor) -> tensor - return %0 : tensor -} - -// ----- - -// CHECK-LABEL: func @conv_3d_input_ncdhw_filter_dhwcf -func @conv_3d_input_ncdhw_filter_dhwcf(%input: memref, %filter: memref, %output: memref) { - // CHECK: linalg.conv_3d_input_ncdhw_filter_dhwcf - // CHECK-SAME: dilations = dense<1> : tensor<3xi64> - // CHECK-SAME: strides = dense<1> : tensor<3xi64> - // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref, memref) - // 
CHECK-SAME: outs(%{{.+}} : memref) - linalg.conv_3d_input_ncdhw_filter_dhwcf {dilations = dense<1> : tensor<3xi64>, - strides = dense<1> : tensor<3xi64>} - ins (%input, %filter: memref, memref) - outs (%output: memref) - return -} - -// ----- - // CHECK-LABEL: func @pooling_nhwc_sum_tensor // CHECK: %{{.+}} = linalg.pooling_nhwc_sum // CHECK-SAME: dilations = dense<1> : tensor<2xi64> diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir deleted file mode 100644 index 897f360b8153f..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul_column_major_as_row_major.mlir +++ /dev/null @@ -1,136 +0,0 @@ -// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ -// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul_column_major register-tile-sizes=16,0,32 vectorize" | \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul_column_major_as_row_major anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.fill register-tile-sizes=4,16 vectorize" | \ - -// TODO: linalg.copy vectorization in the presence of permutation map fails. Enable when addressed. 
-// R_UN: mlir-opt -test-linalg-codegen-strategy="anchor-op=linalg.copy register-tile-sizes=4,16 vectorize" | \ - -// RUN: mlir-opt -canonicalize -convert-vector-to-scf -lower-affine -convert-linalg-to-loops | \ -// RUN: mlir-opt -canonicalize -convert-scf-to-std -convert-vector-to-llvm -convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ -// Activate to dump assembly -// R_UN: -dump-object-file -object-filename=/tmp/a.o \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ -// Use tee to both print to stderr and FileCheck -// RUN: tee -a /dev/stderr | FileCheck %s - -!elem_type_a = type f32 -!elem_type_b = type f32 -!elem_type_c = type f32 -!row_major_A = type memref<${M}x${K}x!elem_type_a> -!row_major_B = type memref<${K}x${N}x!elem_type_b> -!row_major_C = type memref<${M}x${N}x!elem_type_c> -!column_major_A = type memref<${K}x${M}x!elem_type_a> -!column_major_B = type memref<${N}x${K}x!elem_type_b> -!column_major_C = type memref<${N}x${M}x!elem_type_c> - -func @matmul_column_major_as_row_major( - %ca: !column_major_A, %cb: !column_major_B, %cc: !column_major_C, - %a: !row_major_A, %b: !row_major_B, %c: !row_major_C) -// TODO: activate manually for now. 
-// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} -{ - linalg.copy(%ca, %a) {inputPermutation = affine_map<(i, j) -> (j, i)> } : !column_major_A, !row_major_A - linalg.copy(%cb, %b) {inputPermutation = affine_map<(i, j) -> (j, i)> } : !column_major_B, !row_major_B - linalg.matmul ins(%a, %b : !row_major_A, !row_major_B) - outs(%c: !row_major_C) - linalg.copy(%c, %cc) {inputPermutation = affine_map<(i, j) -> (j, i)> } : !row_major_C, !column_major_C - return -} - -func @print_perf(%iters: index, %total_time: f64) { - %c2 = constant 2 : index - %cM = constant ${M} : index - %cN = constant ${N} : index - %cK = constant ${K} : index - - %mn = muli %cM, %cN : index - %mnk = muli %mn, %cK : index - - // 2*M*N*K. - %flops_per_iter = muli %c2, %mnk : index - %flops = muli %iters, %flops_per_iter : index - %flops_i64 = index_cast %flops : index to i64 - %flops_f = sitofp %flops_i64 : i64 to f64 - %flops_per_s = divf %flops_f, %total_time : f64 - vector.print %flops_per_s : f64 - - return -} - -func @main() { - %f0 = constant 0.0 : !elem_type_c - %f1 = constant 1.0 : !elem_type_a - - %cA = memref.alloc() : !column_major_A - %cB = memref.alloc() : !column_major_B - %cC = memref.alloc() : !column_major_C - - linalg.fill(%f1, %cA) : !elem_type_a, !column_major_A - linalg.fill(%f1, %cB) : !elem_type_b, !column_major_B - linalg.fill(%f0, %cC) : !elem_type_c, !column_major_C - - %c0 = constant 0: index - %c1 = constant 1: index - %iters = constant ${ITERS}: index - - /// Run and dump performance for matmul_column_major as a row-major - %A = memref.alloc() : !row_major_A - %B = memref.alloc() : !row_major_B - %C = memref.alloc() : !row_major_C - %t_start_matmul_column_major_as_row_major = call @rtclock() : () -> f64 - scf.for %arg0 = %c0 to %iters step %c1 { - // linalg.matmul writes %C in place, need to reset it to zero every time. - // This is accounts for about 10-15% perf hit on small sizes. 
- // Once linalg on tensors is ready, fusing fill at the register level will - // be easy. - linalg.fill(%f0, %C) : !elem_type_c, !row_major_C - call @matmul_column_major_as_row_major(%cA, %cB, %cC, %A, %B, %C) : - (!column_major_A, !column_major_B, !column_major_C, - !row_major_A, !row_major_B, !row_major_C) -> () - } - %t_end_matmul_column_major_as_row_major = call @rtclock() : () -> f64 - %tmatmul_column_major_as_row_major = subf %t_end_matmul_column_major_as_row_major, %t_start_matmul_column_major_as_row_major: f64 - call @print_perf(%iters, %tmatmul_column_major_as_row_major) : (index, f64) -> () - - // CHECK: {{^0$}} - %cC_ref = memref.alloc() : !column_major_C - linalg.fill(%f0, %cC_ref) : !elem_type_c, !column_major_C - linalg.matmul_column_major ins(%cA, %cB : !column_major_A, !column_major_B) - outs(%cC_ref: !column_major_C) - %act1 = memref.cast %cC : !column_major_C to memref<*xf32> - %exp1 = memref.cast %cC_ref : !column_major_C to memref<*xf32> - %errors1 = call @verifyMemRefF32(%act1, %exp1) : (memref<*xf32>, memref<*xf32>) -> i64 - vector.print %errors1 : i64 - memref.dealloc %cC_ref : !column_major_C - - // CHECK: {{^0$}} - %C_ref = memref.alloc() : !row_major_C - linalg.fill(%f0, %C_ref) : !elem_type_c, !row_major_C - linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) - outs(%C_ref: !row_major_C) - %act2 = memref.cast %C : !row_major_C to memref<*xf32> - %exp2 = memref.cast %C_ref : !row_major_C to memref<*xf32> - %errors2 = call @verifyMemRefF32(%act2, %exp2) : (memref<*xf32>, memref<*xf32>) -> i64 - vector.print %errors2 : i64 - memref.dealloc %C_ref : !row_major_C - - memref.dealloc %A : !row_major_A - memref.dealloc %B : !row_major_B - memref.dealloc %C : !row_major_C - - memref.dealloc %cA : !column_major_A - memref.dealloc %cB : !column_major_B - memref.dealloc %cC : !column_major_C - - return -} - -func private @rtclock() -> f64 -func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface } - 
-// TODO: init with random, run and check output. -// func private @fill_random_f32(memref<*xf32>) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir deleted file mode 100644 index 7e4b27679b5ce..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-input-ncw-filter-wcf-call.mlir +++ /dev/null @@ -1,70 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private 
@print_memref_f32(memref<*xf32>) - -// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f -func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_1d_input_ncw_filter_wcf(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_1d_input_ncw_filter_wcf {dilations = dense<1> : tensor<1xi64>, - strides = dense<1> : tensor<1xi64>} - ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter1D_ncw = call @alloc_3d_filled_f32(%c3, %c1, %c1, %val) : (index, index, index, f32) -> (memref) - %in1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c8, %val) : (index, index, index, f32) -> (memref) - %out1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c6, %zero) : (index, index, index, f32) -> (memref) - - memref.store %f10, %in1D_ncw[%c0, %c0, %c3] : memref - call @conv_1d_input_ncw_filter_wcf(%in1D_ncw, %filter1D_ncw, %out1D_ncw) : (memref, memref, memref) -> () - %out1D_ncw_ = memref.cast %out1D_ncw : memref to memref<*xf32> - call @print_memref_f32(%out1D_ncw_): (memref<*xf32>) -> () - - memref.dealloc %filter1D_ncw : memref - memref.dealloc %in1D_ncw : memref - memref.dealloc %out1D_ncw : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [12, 28, 28, 28, 12, 12] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir deleted file mode 100644 index 3a85d500460f7..0000000000000 --- 
a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir +++ /dev/null @@ -1,68 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f -func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_1d_ncw(%arg0: memref, %arg1: 
memref, %arg2: memref) { - linalg.conv_1d_ncw ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c3, %val) : (index, index, index, f32) -> (memref) - %in1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c8, %val) : (index, index, index, f32) -> (memref) - %out1D_ncw = call @alloc_3d_filled_f32(%c1, %c1, %c6, %zero) : (index, index, index, f32) -> (memref) - - memref.store %f10, %in1D_ncw[%c0, %c0, %c3] : memref - call @conv_1d_ncw(%in1D_ncw, %filter1D_ncw, %out1D_ncw) : (memref, memref, memref) -> () - %out1D_ncw_ = memref.cast %out1D_ncw : memref to memref<*xf32> - call @print_memref_f32(%out1D_ncw_): (memref<*xf32>) -> () - - memref.dealloc %filter1D_ncw : memref - memref.dealloc %in1D_ncw : memref - memref.dealloc %out1D_ncw : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [12, 28, 28, 28, 12, 12] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir deleted file mode 100644 index 68f890c9dabb2..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir +++ /dev/null @@ -1,79 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,4" -convert-linalg-to-loops 
-convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,4" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f -func @alloc_3d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_1d_nwc(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_1d_nwc ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter1D_nwc = call @alloc_3d_filled_f32(%c1, %c3, %c1, %val) : (index, index, index, f32) -> (memref) - %in1D_nwc = call @alloc_3d_filled_f32(%c3, 
%c8, %c1, %val) : (index, index, index, f32) -> (memref) - %out1D_nwc = call @alloc_3d_filled_f32(%c3, %c6, %c1, %zero) : (index, index, index, f32) -> (memref) - - memref.store %f10, %in1D_nwc[%c0, %c3, %c0] : memref - call @conv_1d_nwc(%in1D_nwc, %filter1D_nwc, %out1D_nwc) : (memref, memref, memref) -> () - %out1D_nwc_ = memref.cast %out1D_nwc : memref to memref<*xf32> - call @print_memref_f32(%out1D_nwc_): (memref<*xf32>) -> () - - memref.dealloc %filter1D_nwc : memref - memref.dealloc %in1D_nwc : memref - memref.dealloc %out1D_nwc : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [12], -// CHECK-COUNT-3: [28], -// CHECK-NEXT: [12], -// CHECK-NEXT: [12] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-5: [12], -// CHECK-NEXT: [12] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-5: [12], -// CHECK-NEXT: [12] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir deleted file mode 100644 index 32e548cbb3240..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-input-nchw-filter-hwcf-call.mlir +++ /dev/null @@ -1,83 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: 
-shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f -func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_2d_input_nchw_filter_hwcf(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_2d_input_nchw_filter_hwcf {dilations = dense<1> : tensor<2xi64>, - strides = dense<1> : tensor<2xi64>} - ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter2D_nchw = call @alloc_4d_filled_f32(%c3, %c3, %c1, %c1, %val) : (index, index, index, index, f32) -> (memref) - %in2D_nchw = call @alloc_4d_filled_f32(%c3, %c1, %c8, %c8, %val) : (index, index, index, 
index, f32) -> (memref) - %out2D_nchw = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (memref) - - memref.store %f10, %in2D_nchw[%c0, %c0, %c0, %c3] : memref - call @conv_2d_input_nchw_filter_hwcf(%in2D_nchw, %filter2D_nchw, %out2D_nchw) : (memref, memref, memref) -> () - %out2D_nchw_ = memref.cast %out2D_nchw : memref to memref<*xf32> - call @print_memref_f32(%out2D_nchw_): (memref<*xf32>) -> () - - memref.dealloc %filter2D_nchw : memref - memref.dealloc %in2D_nchw : memref - memref.dealloc %out2D_nchw : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [36, 52, 52, 52, 36, 36], -// CHECK-COUNT-5: [36, 36, 36, 36, 36, 36] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [36, 36, 36, 36, 36, 36] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [36, 36, 36, 36, 36, 36] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir deleted file mode 100644 index 5c75aa4fc6dd6..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir +++ /dev/null @@ -1,83 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: 
-shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f -func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_2d_nchw(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_2d_nchw - {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} - ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter2D_nchw = call @alloc_4d_filled_f32(%c1, %c1, %c3, %c3, %val) : (index, index, index, index, f32) -> (memref) - %in2D_nchw = call @alloc_4d_filled_f32(%c3, %c1, %c8, %c8, %val) : (index, index, index, index, f32) -> (memref) - 
%out2D_nchw = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (memref) - - memref.store %f10, %in2D_nchw[%c0, %c0, %c0, %c3] : memref - call @conv_2d_nchw(%in2D_nchw, %filter2D_nchw, %out2D_nchw) : (memref, memref, memref) -> () - %out2D_nchw_ = memref.cast %out2D_nchw : memref to memref<*xf32> - call @print_memref_f32(%out2D_nchw_): (memref<*xf32>) -> () - - memref.dealloc %filter2D_nchw : memref - memref.dealloc %in2D_nchw : memref - memref.dealloc %out2D_nchw : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [36, 52, 52, 52, 36, 36], -// CHECK-COUNT-5: [36, 36, 36, 36, 36, 36] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [36, 36, 36, 36, 36, 36] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [36, 36, 36, 36, 36, 36] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir deleted file mode 100644 index 51326560b59e1..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir +++ /dev/null @@ -1,127 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,3,2" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: 
-shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,3,2" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f -func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3, %s4) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_2d_nhwc(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_2d_nhwc ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter2D_nhwc = call @alloc_4d_filled_f32(%c1, %c3, %c3, %c3, %val) :(index, index, index, index, f32) -> (memref) - %in2D_nhwc = call @alloc_4d_filled_f32(%c3, %c8, %c8, %c3, %val) : (index, index, index, index, f32) -> (memref) - %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c6, %c6, %c1, %zero) : (index, index, index, 
index, f32) -> (memref) - - memref.store %f10, %in2D_nhwc[%c0, %c0, %c3, %c0] : memref - call @conv_2d_nhwc(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (memref, memref, memref) -> () - %out2D_nhwc_ = memref.cast %out2D_nhwc : memref to memref<*xf32> - call @print_memref_f32(%out2D_nhwc_): (memref<*xf32>) -> () - - memref.dealloc %filter2D_nhwc : memref - memref.dealloc %in2D_nhwc : memref - memref.dealloc %out2D_nhwc : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [108], -// CHECK-COUNT-3: [124], -// CHECK-COUNT-2: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir deleted file mode 
100644 index 5f063543a584e..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-input-ncdhw-filter-dhwcf-call.mlir +++ /dev/null @@ -1,90 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f -func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, 
%s3, %s4, %s5) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_3d_input_ncdhw_filter_dhwcf(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_3d_input_ncdhw_filter_dhwcf {dilations = dense<1> : tensor<3xi64>, - strides = dense<1> : tensor<3xi64>} - ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter3D_ncdhw = call @alloc_5d_filled_f32(%c3, %c3, %c3, %c1, %c1, %val) : (index, index, index, index, index, f32) -> (memref) - %in3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c8, %c8, %c8, %val) : (index, index, index, index, index, f32) -> (memref) - %out3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c6, %c6, %c6, %zero) : (index, index, index, index, index, f32) -> (memref) - - memref.store %f10, %in3D_ncdhw[%c0, %c0, %c0, %c0, %c3] : memref - call @conv_3d_input_ncdhw_filter_dhwcf(%in3D_ncdhw, %filter3D_ncdhw, %out3D_ncdhw) : (memref, memref, memref) -> () - %out3D_ncdhw_ = memref.cast %out3D_ncdhw : memref to memref<*xf32> - call @print_memref_f32(%out3D_ncdhw_): (memref<*xf32>) -> () - - memref.dealloc %filter3D_ncdhw : memref - memref.dealloc %in3D_ncdhw : memref - memref.dealloc %out3D_ncdhw : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [108, 124, 124, 124, 108, 108], -// CHECK-COUNT-5: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: 
[108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir deleted file mode 100644 index cb7b49eec4d22..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir +++ /dev/null @@ -1,88 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" \ -// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: 
-shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f -func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3, %s4, %s5) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_3d_ncdhw(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_3d_ncdhw ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c3, %c3, %c3, %val) : (index, index, index, index, index, f32) -> (memref) - %in3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c8, %c8, %c8, %val) : (index, index, index, index, index, f32) -> (memref) - %out3D_ncdhw = call @alloc_5d_filled_f32(%c1, %c1, %c6, %c6, %c6, %zero) : (index, index, index, index, index, f32) -> (memref) - - memref.store %f10, %in3D_ncdhw[%c0, %c0, %c0, %c0, %c3] : memref - call @conv_3d_ncdhw(%in3D_ncdhw, %filter3D_ncdhw, %out3D_ncdhw) : (memref, memref, memref) -> () - %out3D_ncdhw_ = memref.cast %out3D_ncdhw : memref to memref<*xf32> - call @print_memref_f32(%out3D_ncdhw_): (memref<*xf32>) -> () - - memref.dealloc %filter3D_ncdhw : memref - memref.dealloc %in3D_ncdhw : memref - memref.dealloc %out3D_ncdhw : memref - return -} - -// CHECK: Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [108, 124, 124, 124, 108, 108], -// CHECK-COUNT-5: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 
108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108, 108, 108, 108, 108, 108] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir deleted file mode 100644 index f761088b22811..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir +++ /dev/null @@ -1,190 +0,0 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-scf-to-std -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,5,5,5" -convert-linalg-to-loops -convert-scf-to-std \ -// RUN: -convert-linalg-to-llvm -lower-affine -convert-scf-to-std --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,5,5,5" \ -// RUN: 
-test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm -lower-affine -convert-scf-to-std -convert-vector-to-llvm --convert-memref-to-llvm -convert-std-to-llvm | \ -// RUN: mlir-cpu-runner -e main -entry-point-result=void \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: | FileCheck %s - -func private @print_memref_f32(memref<*xf32>) - -// Creates and returns 5-D buffer of size (%s1, %s2, %s3, %s4, %s5) filled with the value %f -func @alloc_5d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %s5 : index, %f : f32) -> memref { - %buf = memref.alloc(%s1, %s2, %s3, %s4, %s5) : memref - linalg.fill(%f, %buf) : f32, memref - return %buf : memref -} - -func @conv_3d_ndhwc(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.conv_3d_ndhwc ins (%arg0, %arg1: memref, memref) - outs (%arg2: memref) - return -} - - -func @main() { - %c0 = constant 0 : index - %c1 = constant 1 : index - %c3 = constant 3 : index - %c6 = constant 6 : index - %c8 = constant 8 : index - %f10 = constant 10.00000e+00 : f32 - %val = constant 2.00000e+00 : f32 - %zero = constant 0.00000e+00 : f32 - - %filter3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c3, %c3, %c3, %c1, %val) : (index, index, index, index, index, f32) -> (memref) - %in3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c8, %c8, %c8, %c1, %val) : (index, index, index, index, index, f32) -> (memref) - %out3D_ndhwc = call @alloc_5d_filled_f32(%c1, %c6, %c6, %c6, %c1, %zero) : (index, index, index, index, index, f32) -> (memref) - - memref.store %f10, %in3D_ndhwc[%c0, %c0, %c0, %c3, %c0] : memref - call @conv_3d_ndhwc(%in3D_ndhwc, %filter3D_ndhwc, %out3D_ndhwc) : (memref, memref, memref) -> () - %out3D_ndhwc_ = memref.cast %out3D_ndhwc : memref to memref<*xf32> - call @print_memref_f32(%out3D_ndhwc_): (memref<*xf32>) -> () - - memref.dealloc %filter3D_ndhwc : memref - memref.dealloc %in3D_ndhwc : memref - memref.dealloc %out3D_ndhwc : memref - return -} - -// CHECK: 
Unranked Memref {{.*}} -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [ -// CHECK-SAME: [108], -// CHECK-COUNT-3: [124], -// CHECK-COUNT-2: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// 
CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-SAME: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ], -// CHECK-NEXT: [ -// CHECK-COUNT-6: [108] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] -// CHECK-SAME: ] diff --git a/mlir/test/python/integration/dialects/linalg/opsrun.py b/mlir/test/python/integration/dialects/linalg/opsrun.py index f730e637b6cdd..b6e5d98c0e256 100644 --- a/mlir/test/python/integration/dialects/linalg/opsrun.py +++ b/mlir/test/python/integration/dialects/linalg/opsrun.py @@ -244,71 +244,6 @@ def fill_on_buffers(min, max, seed, out): test_fill_generic() -def test_conv_builtin(): - with Context() as ctx, Location.unknown(): - module = Module.create() - f64 = F64Type.get() - i32 = IntegerType.get_signless(32) - with InsertionPoint(module.body): - - @builtin.FuncOp.from_py_func( - MemRefType.get((1, 4, 16, 1), f64), MemRefType.get((2, 2, 1), f64), - MemRefType.get((1, 2, 4, 1), i32)) - def conv_on_buffers(input, filter, output): - linalg.depthwise_conv_2d_input_nhwc_filter_hwc_poly( - input, filter, outs=[output], strides=[2, 4], dilations=[1, 2]) - - execution_engine = ExecutionEngine(transform(module, conv_boiler)) - - # TODO: FFI-based solution to allow testing and printing with python code. - # Prepare arguments: one result i32. - # Arguments must be passed as pointers. 
- c_int_p = ctypes.c_int * 1 - res = c_int_p(-1) - execution_engine.invoke("main", res) - - log("RESULT: ", res[0]) - # CHECK: RESULT: 8 - - -test_conv_builtin() - - -def test_conv_generic(): - with Context() as ctx, Location.unknown(): - module = Module.create() - f64 = F64Type.get() - i32 = IntegerType.get_signless(32) - with InsertionPoint(module.body): - - @builtin.FuncOp.from_py_func( - MemRefType.get((1, 4, 16, 1), f64), MemRefType.get((2, 2, 1), f64), - MemRefType.get((1, 2, 4, 1), i32)) - def conv_on_buffers(input, filter, output): - linalg.depthwise_conv_2d_input_nhwc_filter_hwc_poly( - input, - filter, - outs=[output], - strides=[2, 4], - dilations=[1, 2], - emit_generic=True) - - execution_engine = ExecutionEngine(transform(module, conv_boiler)) - - # TODO: FFI-based solution to allow testing and printing with python code. - # Prepare arguments: one result i32. - # Arguments must be passed as pointers. - c_int_p = ctypes.c_int * 1 - res = c_int_p(-1) - execution_engine.invoke("main", res) - - log("RESULT: ", res[0]) - # CHECK: RESULT: 8 - - -test_conv_generic() - - def test_max_pooling_builtin(): with Context() as ctx, Location.unknown(): module = Module.create() From 735a59047149f104f42c59d3c4d9e847fe956a1b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Sat, 14 Aug 2021 17:46:47 +0200 Subject: [PATCH 138/700] [MemorySSA] Remove -enable-mssa-loop-dependency option This option has been enabled by default for quite a while now. The practical impact of removing the option is that MSSA use cannot be disabled in default pipelines (both LPM and NPM) and in manual LPM invocations. NPM can still choose to enable/disable MSSA using loop vs loop-mssa. The next step will be to require MSSA for LICM and drop the AST-based implementation entirely. 
Differential Revision: https://reviews.llvm.org/D108075 --- llvm/include/llvm/Analysis/MemorySSA.h | 3 -- llvm/lib/Analysis/MemorySSA.cpp | 4 -- llvm/lib/Passes/PassBuilder.cpp | 16 +++---- llvm/lib/Transforms/Scalar/LICM.cpp | 10 ++--- .../Transforms/Scalar/LoopInstSimplify.cpp | 17 +++---- llvm/lib/Transforms/Scalar/LoopRotation.cpp | 15 +++---- .../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 21 +++------ llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 19 +++----- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 23 ++++------ llvm/lib/Transforms/Utils/LoopSimplify.cpp | 13 +++--- llvm/test/Analysis/BasicAA/store-promote.ll | 1 - llvm/test/Analysis/MemorySSA/pr42294.ll | 2 - llvm/test/CodeGen/PowerPC/pr35688.ll | 45 ++++++------------- llvm/test/Transforms/LICM/argmemonly-call.ll | 1 - llvm/test/Transforms/LICM/atomics.ll | 1 - llvm/test/Transforms/LICM/guards.ll | 1 - llvm/test/Transforms/LICM/pr38513.ll | 2 +- llvm/test/Transforms/LICM/promote-order.ll | 1 - llvm/test/Transforms/LICM/read-only-calls.ll | 1 - llvm/test/Transforms/LICM/store-hoisting.ll | 1 - .../Transforms/Scalar/LoopPassManagerTest.cpp | 12 ++--- 21 files changed, 67 insertions(+), 142 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index f40b99968fd3a..024d2c4b003c9 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -106,9 +106,6 @@ namespace llvm { -/// Enables memory ssa as a dependency for loop passes. -extern cl::opt EnableMSSALoopDependency; - class AllocaInst; class Function; class Instruction; diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index b402b0467f5de..dc830ca7a812a 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -90,10 +90,6 @@ bool llvm::VerifyMemorySSA = true; #else bool llvm::VerifyMemorySSA = false; #endif -/// Enables memory ssa as a dependency for loop passes in legacy pass manager. 
-cl::opt llvm::EnableMSSALoopDependency( - "enable-mssa-loop-dependency", cl::Hidden, cl::init(true), - cl::desc("Enable MemorySSA dependency for loop pass manager")); static cl::opt VerifyMemorySSAX("verify-memoryssa", cl::location(VerifyMemorySSA), diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index cdf0a732e6708..6ac722c335bd8 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -617,7 +617,7 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), - EnableMSSALoopDependency, + /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); @@ -791,7 +791,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), - EnableMSSALoopDependency, + /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); @@ -848,7 +848,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true)); + /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(CoroElidePass()); @@ -1246,9 +1246,9 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, OptimizationLevel::O3)); FPM.addPass( RequireAnalysisPass()); - FPM.addPass(createFunctionToLoopPassAdaptor( - std::move(LPM), EnableMSSALoopDependency, - /*UseBlockFrequencyInfo=*/true)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); FPM.addPass(SimplifyCFGPass()); FPM.addPass(InstCombinePass()); } @@ -1307,7 +1307,7 @@ 
void PassBuilder::addVectorPasses(OptimizationLevel Level, RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true)); + /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); } // Now that we've vectorized and unrolled loops, we may have more refined @@ -1828,7 +1828,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, FunctionPassManager MainFPM; MainFPM.addPass(createFunctionToLoopPassAdaptor( LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), - EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true)); + /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); if (RunNewGVN) MainFPM.addPass(NewGVNPass()); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 33c5abf0fe302..9e5e7d2a5935b 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -229,9 +229,7 @@ struct LegacyLICMPass : public LoopPass { << L->getHeader()->getNameOrAsOperand() << "\n"); auto *SE = getAnalysisIfAvailable(); - MemorySSA *MSSA = EnableMSSALoopDependency - ? (&getAnalysis().getMSSA()) - : nullptr; + MemorySSA *MSSA = &getAnalysis().getMSSA(); bool hasProfileData = L->getHeader()->getParent()->hasProfileData(); BlockFrequencyInfo *BFI = hasProfileData ?
&getAnalysis().getBFI() @@ -258,10 +256,8 @@ struct LegacyLICMPass : public LoopPass { AU.addPreserved(); AU.addPreserved(); AU.addRequired(); - if (EnableMSSALoopDependency) { - AU.addRequired(); - AU.addPreserved(); - } + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); getLoopAnalysisUsage(AU); LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU); diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 3153a8721193b..61cb3e94d6bfe 100644 --- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -195,15 +195,10 @@ class LoopInstSimplifyLegacyPass : public LoopPass { const TargetLibraryInfo &TLI = getAnalysis().getTLI( *L->getHeader()->getParent()); - MemorySSA *MSSA = nullptr; - Optional MSSAU; - if (EnableMSSALoopDependency) { - MSSA = &getAnalysis().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); - } + MemorySSA *MSSA = &getAnalysis().getMSSA(); + MemorySSAUpdater MSSAU(MSSA); - return simplifyLoopInst(*L, DT, LI, AC, TLI, - MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr); + return simplifyLoopInst(*L, DT, LI, AC, TLI, &MSSAU); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -211,10 +206,8 @@ class LoopInstSimplifyLegacyPass : public LoopPass { AU.addRequired(); AU.addRequired(); AU.setPreservesCFG(); - if (EnableMSSALoopDependency) { - AU.addRequired(); - AU.addPreserved(); - } + AU.addRequired(); + AU.addPreserved(); getLoopAnalysisUsage(AU); } }; diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 6d5b19443c767..5ba137b1c85fb 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -99,8 +99,7 @@ class LoopRotateLegacyPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - if (EnableMSSALoopDependency) - AU.addPreserved(); + AU.addPreserved(); getLoopAnalysisUsage(AU); // Lazy BFI and BPI are marked as preserved here so LoopRotate @@ -121,13 +120,11 @@ class LoopRotateLegacyPass : public LoopPass { auto &SE = getAnalysis().getSE(); const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional MSSAU; - if (EnableMSSALoopDependency) { - // Not requiring MemorySSA and getting it only if available will split - // the loop pass pipeline when LoopRotate is being run first. - auto *MSSAA = getAnalysisIfAvailable(); - if (MSSAA) - MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); - } + // Not requiring MemorySSA and getting it only if available will split + // the loop pass pipeline when LoopRotate is being run first. + auto *MSSAA = getAnalysisIfAvailable(); + if (MSSAA) + MSSAU = MemorySSAUpdater(&MSSAA->getMSSA()); // Vectorization requires loop-rotation. Use default threshold for loops the // user explicitly marked for vectorization, even when header duplication is // disabled. 
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index cc6d112208079..6fa736426e459 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -733,27 +733,20 @@ class LoopSimplifyCFGLegacyPass : public LoopPass { DominatorTree &DT = getAnalysis().getDomTree(); LoopInfo &LI = getAnalysis().getLoopInfo(); ScalarEvolution &SE = getAnalysis().getSE(); - Optional MSSAU; - if (EnableMSSALoopDependency) { - MemorySSA *MSSA = &getAnalysis().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); - if (VerifyMemorySSA) - MSSA->verifyMemorySSA(); - } + MemorySSA *MSSA = &getAnalysis().getMSSA(); + MemorySSAUpdater MSSAU(MSSA); + if (VerifyMemorySSA) + MSSA->verifyMemorySSA(); bool DeleteCurrentLoop = false; - bool Changed = simplifyLoopCFG( - *L, DT, LI, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, - DeleteCurrentLoop); + bool Changed = simplifyLoopCFG(*L, DT, LI, SE, &MSSAU, DeleteCurrentLoop); if (DeleteCurrentLoop) LPM.markLoopAsDeleted(*L); return Changed; } void getAnalysisUsage(AnalysisUsage &AU) const override { - if (EnableMSSALoopDependency) { - AU.addRequired(); - AU.addPreserved(); - } + AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); getLoopAnalysisUsage(AU); } diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 9a854ff802465..9770465545473 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -232,10 +232,8 @@ namespace { AU.addPreserved(); AU.addRequired(); AU.addRequired(); - if (EnableMSSALoopDependency) { - AU.addRequired(); - AU.addPreserved(); - } + AU.addRequired(); + AU.addPreserved(); if (HasBranchDivergence) AU.addRequired(); getLoopAnalysisUsage(AU); @@ -539,11 +537,8 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) { LPM = &LPMRef; DT = &getAnalysis().getDomTree(); AA = &getAnalysis().getAAResults(); - 
if (EnableMSSALoopDependency) { - MSSA = &getAnalysis().getMSSA(); - MSSAU = std::make_unique(MSSA); - assert(DT && "Cannot update MemorySSA without a valid DomTree."); - } + MSSA = &getAnalysis().getMSSA(); + MSSAU = std::make_unique(MSSA); CurrentLoop = L; Function *F = CurrentLoop->getHeader()->getParent(); @@ -551,19 +546,19 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) { if (SanitizeMemory) SafetyInfo.computeLoopSafetyInfo(L); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); bool Changed = false; do { assert(CurrentLoop->isLCSSAForm(*DT)); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); RedoLoop = false; Changed |= processCurrentLoop(); } while (RedoLoop); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); return Changed; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b9cccc2af3090..a6d489f825b22 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3126,10 +3126,8 @@ class SimpleLoopUnswitchLegacyPass : public LoopPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); - if (EnableMSSALoopDependency) { - AU.addRequired(); - AU.addPreserved(); - } + AU.addRequired(); + AU.addPreserved(); getLoopAnalysisUsage(AU); } }; @@ -3150,12 +3148,8 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { auto &AC = getAnalysis().getAssumptionCache(F); auto &AA = getAnalysis().getAAResults(); auto &TTI = getAnalysis().getTTI(F); - MemorySSA *MSSA = nullptr; - Optional MSSAU; - if (EnableMSSALoopDependency) { - MSSA = &getAnalysis().getMSSA(); - MSSAU = MemorySSAUpdater(MSSA); - } + MemorySSA *MSSA = &getAnalysis().getMSSA(); + MemorySSAUpdater MSSAU(MSSA); auto *SEWP = getAnalysisIfAvailable(); auto *SE = SEWP ? 
&SEWP->getSE() : nullptr; @@ -3179,14 +3173,13 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { LPM.markLoopAsDeleted(*L); }; - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); - bool Changed = - unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, UnswitchCB, SE, - MSSAU.hasValue() ? MSSAU.getPointer() : nullptr); + bool Changed = unswitchLoop(*L, DT, LI, AC, AA, TTI, true, NonTrivial, + UnswitchCB, SE, &MSSAU); - if (MSSA && VerifyMemorySSA) + if (VerifyMemorySSA) MSSA->verifyMemorySSA(); // Historically this pass has had issues with the dominator tree so verify it diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index d2fd32c98d734..d14c006c80327 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -779,8 +779,7 @@ namespace { AU.addPreserved(); AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added. AU.addPreserved(); - if (EnableMSSALoopDependency) - AU.addPreserved(); + AU.addPreserved(); } /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees. @@ -814,12 +813,10 @@ bool LoopSimplify::runOnFunction(Function &F) { &getAnalysis().getAssumptionCache(F); MemorySSA *MSSA = nullptr; std::unique_ptr MSSAU; - if (EnableMSSALoopDependency) { - auto *MSSAAnalysis = getAnalysisIfAvailable(); - if (MSSAAnalysis) { - MSSA = &MSSAAnalysis->getMSSA(); - MSSAU = std::make_unique(MSSA); - } + auto *MSSAAnalysis = getAnalysisIfAvailable(); + if (MSSAAnalysis) { + MSSA = &MSSAAnalysis->getMSSA(); + MSSAU = std::make_unique(MSSA); } bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); diff --git a/llvm/test/Analysis/BasicAA/store-promote.ll b/llvm/test/Analysis/BasicAA/store-promote.ll index 4bb5d4cfadd66..4ba44b6614b8e 100644 --- a/llvm/test/Analysis/BasicAA/store-promote.ll +++ b/llvm/test/Analysis/BasicAA/store-promote.ll @@ -2,7 +2,6 @@ ; disambiguating some obvious cases. 
If LICM is able to disambiguate the ; two pointers, then the load should be hoisted, and the store sunk. -; RUN: opt < %s -basic-aa -licm -enable-mssa-loop-dependency=false -enable-new-pm=0 -S | FileCheck %s -check-prefixes=CHECK,AST ; RUN: opt < %s -basic-aa -licm -enable-new-pm=0 -S | FileCheck %s -check-prefixes=CHECK,MSSA ; RUN: opt < %s -aa-pipeline=basic-aa -passes='loop(licm)' -S | FileCheck %s -check-prefixes=CHECK,AST ; RUN: opt < %s -aa-pipeline=basic-aa -passes='loop-mssa(licm)' -S | FileCheck %s -check-prefixes=CHECK,MSSA diff --git a/llvm/test/Analysis/MemorySSA/pr42294.ll b/llvm/test/Analysis/MemorySSA/pr42294.ll index e5a687afcab71..dbf996d948145 100644 --- a/llvm/test/Analysis/MemorySSA/pr42294.ll +++ b/llvm/test/Analysis/MemorySSA/pr42294.ll @@ -1,8 +1,6 @@ ; REQUIRES: asserts ; RUN: opt -loop-rotate -licm %s -disable-output -debug-only=licm 2>&1 | FileCheck %s -check-prefix=LICM -; RUN: opt -loop-rotate -licm %s -disable-output -enable-mssa-loop-dependency=false -debug-only=licm 2>&1 | FileCheck %s -check-prefix=LICM ; RUN: opt -loop-rotate -licm %s -S | FileCheck %s -; RUN: opt -loop-rotate -licm %s -S -enable-mssa-loop-dependency=false | FileCheck %s ; LICM: Using ; LICM-NOT: LICM sinking instruction: %.pre = load i8, i8* %arrayidx.phi.trans.insert diff --git a/llvm/test/CodeGen/PowerPC/pr35688.ll b/llvm/test/CodeGen/PowerPC/pr35688.ll index fd1db8332ea6e..5b156fcc057d1 100644 --- a/llvm/test/CodeGen/PowerPC/pr35688.ll +++ b/llvm/test/CodeGen/PowerPC/pr35688.ll @@ -1,45 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -enable-mssa-loop-dependency=false -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ -; RUN: FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ -; RUN: FileCheck %s --check-prefix=MSSA -; Function Attrs: nounwind +; RUN: FileCheck %s + +; With MemorySSA, everything is taken out of the loop by licm. 
+; Loads and stores to undef are treated as non-aliasing. define void @ec_GFp_nistp256_points_mul() { ; CHECK-LABEL: ec_GFp_nistp256_points_mul: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld 5, 0(3) -; CHECK-NEXT: li 3, 127 +; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: subfic 5, 3, 0 +; CHECK-NEXT: subfze 5, 4 +; CHECK-NEXT: sradi 5, 5, 63 +; CHECK-NEXT: subc 3, 5, 3 +; CHECK-NEXT: subfe 3, 4, 5 +; CHECK-NEXT: sradi 3, 3, 63 +; CHECK-NEXT: std 3, 0(3) +; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %fe_cmovznz.exit.i534.i.15 ; CHECK-NEXT: # -; CHECK-NEXT: subfic 6, 5, 0 -; CHECK-NEXT: subfze 6, 4 -; CHECK-NEXT: sradi 7, 6, 63 -; CHECK-NEXT: srad 6, 6, 3 -; CHECK-NEXT: subc 5, 7, 5 -; CHECK-NEXT: subfe 5, 4, 6 -; CHECK-NEXT: sradi 5, 5, 63 ; CHECK-NEXT: b .LBB0_1 -; -; MSSA-LABEL: ec_GFp_nistp256_points_mul: -; MSSA: # %bb.0: # %entry -; MSSA-NEXT: ld 3, 0(3) -; MSSA-NEXT: li 4, 0 -; MSSA-NEXT: subfic 5, 3, 0 -; MSSA-NEXT: subfze 5, 4 -; MSSA-NEXT: sradi 5, 5, 63 -; MSSA-NEXT: subc 3, 5, 3 -; MSSA-NEXT: subfe 3, 4, 5 -; MSSA-NEXT: sradi 3, 3, 63 -; MSSA-NEXT: std 3, 0(3) -; MSSA-NEXT: .p2align 4 -; MSSA-NEXT: .LBB0_1: # %fe_cmovznz.exit.i534.i.15 -; MSSA-NEXT: # -; MSSA-NEXT: b .LBB0_1 -; With MemorySSA, everything is taken out of the loop by licm. -; Loads and stores to undef are treated as non-aliasing. 
entry: br label %fe_cmovznz.exit.i534.i.15 diff --git a/llvm/test/Transforms/LICM/argmemonly-call.ll b/llvm/test/Transforms/LICM/argmemonly-call.ll index 4098daf6e7206..882e9a804266b 100644 --- a/llvm/test/Transforms/LICM/argmemonly-call.ll +++ b/llvm/test/Transforms/LICM/argmemonly-call.ll @@ -1,4 +1,3 @@ -; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=false %s -enable-new-pm=0 | FileCheck %s ; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -verify-memoryssa %s -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 ; RUN: opt -licm -basic-aa -licm-n2-threshold=200 < %s -S -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 diff --git a/llvm/test/Transforms/LICM/atomics.ll b/llvm/test/Transforms/LICM/atomics.ll index cfa177323490c..f5c2a08f84292 100644 --- a/llvm/test/Transforms/LICM/atomics.ll +++ b/llvm/test/Transforms/LICM/atomics.ll @@ -1,4 +1,3 @@ -; RUN: opt < %s -S -basic-aa -licm -enable-mssa-loop-dependency=false -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,AST %s ; RUN: opt < %s -S -basic-aa -licm -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,MSSA %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop(licm)' < %s -S | FileCheck -check-prefixes=CHECK,AST %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,loop-mssa(licm)' < %s -S | FileCheck -check-prefixes=CHECK,MSSA %s diff --git a/llvm/test/Transforms/LICM/guards.ll b/llvm/test/Transforms/LICM/guards.ll index df97cac7544de..5036d6f926b36 100644 --- a/llvm/test/Transforms/LICM/guards.ll +++ b/llvm/test/Transforms/LICM/guards.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: asserts -; RUN: opt -licm -basic-aa -enable-mssa-loop-dependency=false -ipt-expensive-asserts=true < %s -S | FileCheck %s ; RUN: opt -licm -basic-aa -ipt-expensive-asserts=true < %s -S | FileCheck %s --check-prefixes=CHECK,MSSA ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop(licm)' 
-ipt-expensive-asserts=true < %s -S | FileCheck %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop-mssa(licm)' -ipt-expensive-asserts=true < %s -S | FileCheck %s --check-prefixes=CHECK,MSSA diff --git a/llvm/test/Transforms/LICM/pr38513.ll b/llvm/test/Transforms/LICM/pr38513.ll index dea268d88cd73..2030475ce422f 100644 --- a/llvm/test/Transforms/LICM/pr38513.ll +++ b/llvm/test/Transforms/LICM/pr38513.ll @@ -1,4 +1,4 @@ -; RUN: opt -enable-mssa-loop-dependency=false -disable-basic-aa -alias-set-saturation-threshold=2 -licm -S < %s | FileCheck %s +; RUN: opt -disable-basic-aa -alias-set-saturation-threshold=2 -passes='loop(licm)' -S < %s | FileCheck %s ; REQUIRES: asserts ; CHECK-LABEL: @f1() diff --git a/llvm/test/Transforms/LICM/promote-order.ll b/llvm/test/Transforms/LICM/promote-order.ll index 197e94662ce81..a8a11b598873b 100644 --- a/llvm/test/Transforms/LICM/promote-order.ll +++ b/llvm/test/Transforms/LICM/promote-order.ll @@ -1,4 +1,3 @@ -; RUN: opt -tbaa -basic-aa -licm -enable-mssa-loop-dependency=false -enable-new-pm=0 -S < %s | FileCheck %s --check-prefixes=CHECK,AST ; RUN: opt -tbaa -basic-aa -licm -enable-new-pm=0 -S < %s | FileCheck %s --check-prefixes=CHECK,MSSA ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='require,require,require,require,loop(licm)' -S %s | FileCheck %s --check-prefixes=CHECK,AST ; RUN: opt -aa-pipeline=tbaa,basic-aa -passes='require,require,require,require,loop-mssa(licm)' -S %s | FileCheck %s --check-prefixes=CHECK,MSSA diff --git a/llvm/test/Transforms/LICM/read-only-calls.ll b/llvm/test/Transforms/LICM/read-only-calls.ll index 3fa242abab291..c92ff149e22d4 100644 --- a/llvm/test/Transforms/LICM/read-only-calls.ll +++ b/llvm/test/Transforms/LICM/read-only-calls.ll @@ -1,4 +1,3 @@ -; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 -enable-mssa-loop-dependency=false %s -enable-new-pm=0 | FileCheck %s ; RUN: opt -S -basic-aa -licm -licm-n2-threshold=0 %s -enable-new-pm=0 | FileCheck %s 
--check-prefix=ALIAS-N2 ; RUN: opt -licm -basic-aa -licm-n2-threshold=200 < %s -S -enable-new-pm=0 | FileCheck %s --check-prefix=ALIAS-N2 diff --git a/llvm/test/Transforms/LICM/store-hoisting.ll b/llvm/test/Transforms/LICM/store-hoisting.ll index 7795c441b7e17..636b5a013bf9b 100644 --- a/llvm/test/Transforms/LICM/store-hoisting.ll +++ b/llvm/test/Transforms/LICM/store-hoisting.ll @@ -1,4 +1,3 @@ -; RUN: opt -S -basic-aa -licm -enable-mssa-loop-dependency=false %s -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,AST %s ; RUN: opt -S -basic-aa -licm %s -enable-new-pm=0 | FileCheck -check-prefixes=CHECK,MSSA %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop(licm)' < %s -S | FileCheck -check-prefixes=CHECK,AST %s ; RUN: opt -aa-pipeline=basic-aa -passes='require,require,require,require,loop-mssa(licm)' < %s -S | FileCheck -check-prefixes=CHECK,MSSA %s diff --git a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp index db631c221e593..bdabc34decf85 100644 --- a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp +++ b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp @@ -415,8 +415,7 @@ TEST_F(LoopPassManagerTest, FunctionPassInvalidationOfLoopAnalyses) { RequireAnalysisLoopPass())); // For 'f', preserve most things but not the specific loop analyses. 
auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) - PA.preserve(); + PA.preserve(); EXPECT_CALL(MFPHandle, run(HasName("f"), _)) .InSequence(FSequence) .WillOnce(Return(PA)); @@ -494,8 +493,7 @@ TEST_F(LoopPassManagerTest, ModulePassInvalidationOfLoopAnalyses) { EXPECT_CALL(MMPHandle, run(_, _)).WillOnce(InvokeWithoutArgs([] { auto PA = getLoopPassPreservedAnalyses(); PA.preserve(); - if (EnableMSSALoopDependency) - PA.preserve(); + PA.preserve(); return PA; })); // All the loop analyses from both functions get invalidated before we @@ -822,8 +820,7 @@ TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) { // the fact that they were preserved. EXPECT_CALL(MFPHandle, run(HasName("f"), _)).WillOnce(InvokeWithoutArgs([] { auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) - PA.preserve(); + PA.preserve(); PA.preserveSet>(); return PA; })); @@ -845,8 +842,7 @@ TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) { // Which means that no extra invalidation occurs and cached values are used. EXPECT_CALL(MFPHandle, run(HasName("g"), _)).WillOnce(InvokeWithoutArgs([] { auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) - PA.preserve(); + PA.preserve(); PA.preserveSet>(); return PA; })); From b9e433b02a77830b9ba13406b459ab905371e346 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 9 Aug 2021 10:18:52 -0700 Subject: [PATCH 139/700] Prevent machine licm if remattable with a vreg use Check if a rematerializable instruction does not have any virtual register uses. Even though rematerializable, RA might not actually rematerialize it in this scenario. In that case we do not want to hoist such an instruction out of the loop in the belief that RA will sink it back if needed. This already has impact on AMDGPU target which does not check for this condition in its isTriviallyReMaterializable implementation and have instructions with virtual register uses enabled.
The other targets are not impacted at this point although will be when D106408 lands. Differential Revision: https://reviews.llvm.org/D107677 --- llvm/lib/CodeGen/MachineLICM.cpp | 28 +++++++++-- llvm/test/CodeGen/AMDGPU/licm-regpressure.mir | 48 +++++++++---------- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 883299c452b71..42708659c79e1 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -230,6 +230,9 @@ namespace { bool IsGuaranteedToExecute(MachineBasicBlock *BB); + bool isTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) const; + void EnterScope(MachineBasicBlock *MBB); void ExitScope(MachineBasicBlock *MBB); @@ -659,6 +662,23 @@ bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB) { return true; } +/// Check if \p MI is trivially rematerializable and if it does not have any +/// virtual register uses. Even though rematerializable, RA might not actually +/// rematerialize it in this scenario. In that case we do not want to hoist such +/// an instruction out of the loop in the belief that RA will sink it back if needed. +bool MachineLICMBase::isTriviallyReMaterializable(const MachineInstr &MI, + AAResults *AA) const { + if (!TII->isTriviallyReMaterializable(MI, AA)) + return false; + + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) + return false; + } + + return true; +} + void MachineLICMBase::EnterScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Entering " << printMBBReference(*MBB) << '\n'); @@ -1156,9 +1176,9 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) { return false; } - // Rematerializable instructions should always be hoisted since the register - // allocator can just pull them down again when needed.
- if (TII->isTriviallyReMaterializable(MI, AA)) + // Rematerializable instructions should always be hoisted providing the + // register allocator can just pull them down again when needed. + if (isTriviallyReMaterializable(MI, AA)) return true; // FIXME: If there are long latency loop-invariant instructions inside the @@ -1211,7 +1231,7 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) { // High register pressure situation, only hoist if the instruction is going // to be remat'ed. - if (!TII->isTriviallyReMaterializable(MI, AA) && + if (!isTriviallyReMaterializable(MI, AA) && !MI.isDereferenceableInvariantLoad(AA)) { LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI); return false; diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir index dcdbe6bd02dcd..1d033e117ede7 100644 --- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir +++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass machinelicm -o - %s | FileCheck -check-prefix=GCN %s -# FIXME: MachineLICM hoists all V_CVT instructions out of the loop increasing -# register pressure. VGPR budget at occupancy 10 is 24 vgprs. +# MachineLICM shall limit hoisting of V_CVT instructions out of the loop keeping +# register pressure within the budget. VGPR budget at occupancy 10 is 24 vgprs. 
--- name: test @@ -35,41 +35,41 @@ body: | ; GCN: %20:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY2]], implicit $mode, implicit $exec ; GCN: %21:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY3]], implicit $mode, implicit $exec ; GCN: %22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY4]], implicit $mode, implicit $exec + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; GCN: liveins: $vcc + ; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %18, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %19, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %20, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %21, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %22, implicit $exec ; GCN: %23:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY5]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %23, implicit $exec ; GCN: %24:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY6]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %24, implicit $exec ; GCN: %25:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY7]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %25, implicit $exec ; GCN: %26:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY8]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %26, implicit $exec ; GCN: %27:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY9]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %27, implicit $exec ; GCN: %28:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY10]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %28, implicit $exec ; GCN: %29:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY11]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %29, implicit $exec ; GCN: %30:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY12]], implicit $mode, implicit $exec + ; GCN: $vcc = 
V_CMP_EQ_U64_e64 $vcc, killed %30, implicit $exec ; GCN: %31:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY13]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %31, implicit $exec ; GCN: %32:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY14]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %32, implicit $exec ; GCN: %33:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY15]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %33, implicit $exec ; GCN: %34:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY16]], implicit $mode, implicit $exec + ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %34, implicit $exec ; GCN: %35:vreg_64 = nofpexcept V_CVT_F64_I32_e32 [[COPY17]], implicit $mode, implicit $exec - ; GCN: bb.1: - ; GCN: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GCN: liveins: $vcc - ; GCN: $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %18, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %19, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %20, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %21, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %22, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %23, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %24, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %25, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %26, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %27, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %28, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %29, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %30, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %31, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %32, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %33, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %34, implicit $exec - ; GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, %35, implicit $exec + ; 
GCN: $vcc = V_CMP_EQ_U64_e64 $vcc, killed %35, implicit $exec ; GCN: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; GCN: S_BRANCH %bb.2 ; GCN: bb.2: From 9b8425e42c2531f4ec71869800a5e4e9cc0cb5dd Mon Sep 17 00:00:00 2001 From: Rong Xu Date: Mon, 16 Aug 2021 12:16:43 -0700 Subject: [PATCH 140/700] Reapply commit b7425e956 The commit b7425e956: [NFC] fix typos is harmless but was reverted by accident. Reapply. --- .../StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp | 2 +- clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp | 2 +- compiler-rt/test/profile/Linux/instrprof-cs.c | 2 +- llvm/include/llvm/Transforms/Instrumentation.h | 4 ++-- llvm/lib/ProfileData/SampleProfReader.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp index 175dfcef0df45..a13de306eac84 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckObjCInstMethSignature, a flow-insenstive check +// This file defines a CheckObjCInstMethSignature, a flow-insensitive check // that determines if an Objective-C class interface incorrectly redefines // the method signature in a subclass. 
// diff --git a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp index 90c5583d89691..dcca8be55e337 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckNSError, a flow-insenstive check +// This file defines a CheckNSError, a flow-insensitive check // that determines if an Objective-C class interface correctly returns // a non-void return type. // diff --git a/compiler-rt/test/profile/Linux/instrprof-cs.c b/compiler-rt/test/profile/Linux/instrprof-cs.c index d825525a532db..0ad6f0350c560 100644 --- a/compiler-rt/test/profile/Linux/instrprof-cs.c +++ b/compiler-rt/test/profile/Linux/instrprof-cs.c @@ -8,7 +8,7 @@ // RUN: %clang_profgen=%t.profraw -o %t.gen.cis -O2 %s // RUN: %run %t.gen.cis // RUN: llvm-profdata merge -o %t.cis.profdata %t.profraw -// Check context insenstive profile +// Check context insensitive profile // RUN: %clang_profuse=%t.cis.profdata -O2 -emit-llvm -S %s -o - | FileCheck %s --check-prefix=CIS int g1 = 1; int volatile g2 = 2; diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 03108bacb0da5..0c822999aecf3 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -78,7 +78,7 @@ struct GCOVOptions { ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()); -// PGO Instrumention. Parameter IsCS indicates if this is the context senstive +// PGO Instrumention. Parameter IsCS indicates if this is the context sensitive // instrumentation. ModulePass *createPGOInstrumentationGenLegacyPass(bool IsCS = false); ModulePass * @@ -138,7 +138,7 @@ struct InstrProfOptions { }; /// Insert frontend instrumentation based profiling. 
Parameter IsCS indicates if -// this is the context senstive instrumentation. +// this is the context sensitive instrumentation. ModulePass *createInstrProfilingLegacyPass( const InstrProfOptions &Options = InstrProfOptions(), bool IsCS = false); diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 6058eddb13dc7..a801ca1ef36d7 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -53,7 +53,7 @@ using namespace sampleprof; // For ext-binary format profiles, the flag is set in the summary. static cl::opt ProfileIsFSDisciminator( "profile-isfs", cl::Hidden, cl::init(false), - cl::desc("Profile uses flow senstive discriminators")); + cl::desc("Profile uses flow sensitive discriminators")); /// Dump the function profile for \p FName. /// From b51e71fe66b3d08c94f419003b72baa12afbf51e Mon Sep 17 00:00:00 2001 From: Geoffrey Martin-Noble Date: Mon, 16 Aug 2021 12:12:38 -0700 Subject: [PATCH 141/700] [Bazel] Update for 957334382c Update LLVM configuration to define `HAVE_UNW_ADD_DYNAMIC_FDE` for macOS since https://github.com/llvm/llvm-project/commit/957334382c moved that to a define. 
Differential Revision: https://reviews.llvm.org/D108157 --- utils/bazel/llvm-project-overlay/llvm/config.bzl | 1 + .../llvm-project-overlay/llvm/include/llvm/Config/config.h | 3 +++ utils/bazel/llvm_configs/config.h.cmake | 3 +++ 3 files changed, 7 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl index 36e17eecfd7b6..a52e35d2af153 100644 --- a/utils/bazel/llvm-project-overlay/llvm/config.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl @@ -49,6 +49,7 @@ macos_defines = posix_defines + [ "HAVE_MALLOC_MALLOC_H=1", "HAVE_MALLOC_ZONE_STATISTICS=1", "HAVE_PROC_PID_RUSAGE=1", + "HAVE_UNW_ADD_DYNAMIC_FDE=1", ] win32_defines = [ diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index f24768d6c4a6b..4f529b008a4d1 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -80,6 +80,9 @@ /* Define to 1 if we can deregister EH frames on this platform. */ #define HAVE_DEREGISTER_FRAME 1 +/* Define if __unw_add_dynamic_fde() is available on this platform. */ +/* HAVE_UNW_ADD_DYNAMIC_FDE defined in Bazel */ + /* Define to 1 if you have the header file. */ #define HAVE_ERRNO_H 1 diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index 8d58ec9d665b3..d7cd44b5db36a 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -64,6 +64,9 @@ /* Define to 1 if we can deregister EH frames on this platform. */ #cmakedefine HAVE_DEREGISTER_FRAME ${HAVE_DEREGISTER_FRAME} +/* Define if __unw_add_dynamic_fde() is available on this platform. */ +#cmakedefine HAVE_UNW_ADD_DYNAMIC_FDE ${HAVE_UNW_ADD_DYNAMIC_FDE} + /* Define to 1 if you have the header file. 
*/ #cmakedefine HAVE_ERRNO_H ${HAVE_ERRNO_H} From 6c0e6f91d7f02cecdf11efb26050c48680a806ce Mon Sep 17 00:00:00 2001 From: Gulfem Savrun Yeniceri Date: Mon, 16 Aug 2021 18:55:31 +0000 Subject: [PATCH 142/700] [profile] Add static keyword to binary id functions This patch adds static keyword to internal functions that write binary id to restrict visibility to the file that they are declared. Differential Revision: https://reviews.llvm.org/D108154 --- compiler-rt/lib/profile/InstrProfilingPlatformLinux.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index 7c15f97aff898..5d47083b8bfe7 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -94,8 +94,8 @@ static size_t RoundUp(size_t size, size_t align) { * Write binary id length and then its data, because binary id does not * have a fixed length. */ -int WriteOneBinaryId(ProfDataWriter *Writer, uint64_t BinaryIdLen, - const uint8_t *BinaryIdData) { +static int WriteOneBinaryId(ProfDataWriter *Writer, uint64_t BinaryIdLen, + const uint8_t *BinaryIdData) { ProfDataIOVec BinaryIdIOVec[] = { {&BinaryIdLen, sizeof(uint64_t), 1, 0}, {BinaryIdData, sizeof(uint8_t), BinaryIdLen, 0}}; @@ -119,7 +119,8 @@ int WriteOneBinaryId(ProfDataWriter *Writer, uint64_t BinaryIdLen, * Note sections like .note.ABI-tag and .note.gnu.build-id are aligned * to 4 bytes, so round n_namesz and n_descsz to the nearest 4 bytes. */ -int WriteBinaryIdForNote(ProfDataWriter *Writer, const ElfW(Nhdr) * Note) { +static int WriteBinaryIdForNote(ProfDataWriter *Writer, + const ElfW(Nhdr) * Note) { int BinaryIdSize = 0; const char *NoteName = (const char *)Note + sizeof(ElfW(Nhdr)); @@ -144,8 +145,8 @@ int WriteBinaryIdForNote(ProfDataWriter *Writer, const ElfW(Nhdr) * Note) { * If writer is given, write binary ids into profiles. 
* If an error happens while writing, return -1. */ -int WriteBinaryIds(ProfDataWriter *Writer, const ElfW(Nhdr) * Note, - const ElfW(Nhdr) * NotesEnd) { +static int WriteBinaryIds(ProfDataWriter *Writer, const ElfW(Nhdr) * Note, + const ElfW(Nhdr) * NotesEnd) { int TotalBinaryIdsSize = 0; while (Note < NotesEnd) { int Result = WriteBinaryIdForNote(Writer, Note); From 15dc93e61c21abe1966484cd3f091d63616f29ab Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Wed, 11 Aug 2021 22:08:56 -0700 Subject: [PATCH 143/700] [lld-macho] Ignore LLVM segments to prevent duplicate syms There was an instance of a third-party archive containing multiple _llvm symbols from different files that clashed with each other producing duplicate symbols. Symbols under the LLVM segment don't seem to be producing any meaningful value, so just ignore them. Reviewed By: #lld-macho, int3 Differential Revision: https://reviews.llvm.org/D108016 --- lld/MachO/InputFiles.cpp | 16 ++++++--- lld/test/MachO/discard-llvm-sections.s | 46 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 lld/test/MachO/discard-llvm-sections.s diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 04cdd981929b9..32279da1cf627 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -282,16 +282,24 @@ void ObjFile::parseSections(ArrayRef
sections) { } else { auto *isec = make(segname, name, this, data, align, flags); - if (!(isDebugSection(isec->getFlags()) && - isec->getSegName() == segment_names::dwarf)) { - subsections.push_back({{0, isec}}); - } else { + if (isDebugSection(isec->getFlags()) && + isec->getSegName() == segment_names::dwarf) { // Instead of emitting DWARF sections, we emit STABS symbols to the // object files that contain them. We filter them out early to avoid // parsing their relocations unnecessarily. But we must still push an // empty map to ensure the indices line up for the remaining sections. subsections.push_back({}); debugSections.push_back(isec); + } else if (isec->getSegName() == segment_names::llvm) { + // ld64 does not appear to emit contents from sections within the __LLVM + // segment. Symbols within those sections point to bitcode metadata + // instead of actual symbols. Global symbols within those sections could + // have the same name without causing duplicate symbol errors. Push an + // empty map to ensure indices line up for the remaining sections. + // TODO: Evaluate whether the bitcode metadata is needed. + subsections.push_back({}); + } else { + subsections.push_back({{0, isec}}); } } } diff --git a/lld/test/MachO/discard-llvm-sections.s b/lld/test/MachO/discard-llvm-sections.s new file mode 100644 index 0000000000000..571fdbb0b2da0 --- /dev/null +++ b/lld/test/MachO/discard-llvm-sections.s @@ -0,0 +1,46 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo.s -o %t/foo.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/bar.s -o %t/bar.o + +## "_llvm." symbols are not special. LLD would produce duplicate symbol errors +## if they were not within the LLVM segment. + +## 1/ Test that LLD does not produce duplicate symbols errors when linking global symbols +## with the same name under the LLVM segment. 
+# RUN: %lld -dylib %t/foo.o %t/bar.o -o %t/libDuplicate.dylib + +## 2/ Test that all sections within an LLVM segment are dropped. +# RUN: llvm-objdump --section-headers %t/libDuplicate.dylib | FileCheck %s + +# CHECK-LABEL: Sections: +# CHECK-NEXT: Idx Name Size VMA Type +# CHECK-NEXT: 0 __text 00000000 {{[0-9a-f]+}} TEXT + +## 3/ Test that linking global symbol that is not under the LLVM segment produces duplicate +## symbols +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin --defsym TEXT=0 %t/foo.s -o %t/foo.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin --defsym TEXT=0 %t/bar.s -o %t/bar.o +# RUN: not %lld -dylib %t/foo.o %t/bar.o -o %t/libDuplicate.dylib 2>&1 | FileCheck %s --check-prefix=DUP + +# DUP: ld64.lld: error: duplicate symbol: _llvm.foo + +#--- foo.s +.globl _llvm.foo +.ifdef TEXT + .section __TEXT,__cstring +.else + .section __LLVM,__bitcode +.endif + _llvm.foo: + .asciz "test" + +#--- bar.s +.globl _llvm.foo +.ifdef TEXT + .section __TEXT,__cstring +.else + .section __LLVM,__bitcode +.endif + _llvm.foo: + .asciz "test" From 877572cc193a470f310eec46a7ce793a6cc97c2f Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 9 Aug 2021 11:12:15 -0700 Subject: [PATCH 144/700] Allow rematerialization of virtual reg uses Currently isReallyTriviallyReMaterializableGeneric() implementation prevents rematerialization on any virtual register use on the grounds that it is not a trivial rematerialization and that we do not want to extend liveranges. It appears that LRE logic does not attempt to extend a liverange of a source register for rematerialization so that is not an issue. That is checked in the LiveRangeEdit::allUsesAvailableAt(). The only non-trivial aspect of it is accounting for tied-defs which normally represent a read-modify-write operation and not rematerializable. The test for a tied-def situation already exists in the /CodeGen/AMDGPU/remat-vop.mir, test_no_remat_v_cvt_f32_i32_sdwa_dst_unused_preserve.
The change has affected ARM/Thumb, Mips, RISCV, and x86. For the targets where I more or less understand the asm it seems to reduce spilling (as expected) or be neutral. However, it needs a review by all targets' specialists. Differential Revision: https://reviews.llvm.org/D106408 --- llvm/include/llvm/CodeGen/TargetInstrInfo.h | 12 +- llvm/lib/CodeGen/TargetInstrInfo.cpp | 9 +- llvm/test/CodeGen/AMDGPU/remat-sop.mir | 60 + .../CodeGen/ARM/arm-shrink-wrapping-linux.ll | 28 +- llvm/test/CodeGen/ARM/funnel-shift-rot.ll | 32 +- llvm/test/CodeGen/ARM/funnel-shift.ll | 30 +- .../CodeGen/ARM/illegal-bitfield-loadstore.ll | 30 +- llvm/test/CodeGen/ARM/neon-copy.ll | 10 +- llvm/test/CodeGen/Mips/llvm-ir/ashr.ll | 227 +- llvm/test/CodeGen/Mips/llvm-ir/lshr.ll | 206 +- llvm/test/CodeGen/Mips/llvm-ir/shl.ll | 95 +- llvm/test/CodeGen/Mips/llvm-ir/sub.ll | 31 +- llvm/test/CodeGen/Mips/tls.ll | 4 +- llvm/test/CodeGen/RISCV/atomic-rmw.ll | 120 +- llvm/test/CodeGen/RISCV/atomic-signext.ll | 24 +- .../CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll | 96 +- llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll | 12 +- llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll | 526 +-- llvm/test/CodeGen/RISCV/rv32zbb.ll | 94 +- llvm/test/CodeGen/RISCV/rv32zbp.ll | 282 +- llvm/test/CodeGen/RISCV/rv32zbt.ll | 348 +- .../RISCV/rvv/fixed-vectors-bitreverse.ll | 324 +- .../CodeGen/RISCV/rvv/fixed-vectors-bswap.ll | 146 +- .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll | 3540 ++++++++--------- .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll | 720 ++-- llvm/test/CodeGen/RISCV/srem-vector-lkk.ll | 208 +- llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 190 +- llvm/test/CodeGen/Thumb/dyn-stackalloc.ll | 7 +- .../tail-pred-disabled-in-loloops.ll | 14 +- .../varying-outer-2d-reduction.ll | 64 +- .../Thumb2/LowOverheadLoops/while-loops.ll | 67 +- llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll | 30 +- .../CodeGen/Thumb2/mve-float16regloops.ll | 82 +- .../CodeGen/Thumb2/mve-float32regloops.ll | 98 +- llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll | 525 
++- llvm/test/CodeGen/X86/addcarry.ll | 20 +- .../CodeGen/X86/callbr-asm-blockplacement.ll | 12 +- .../CodeGen/X86/dag-update-nodetomatch.ll | 17 +- llvm/test/CodeGen/X86/inalloca-invoke.ll | 2 +- llvm/test/CodeGen/X86/licm-regpressure.ll | 28 +- llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll | 40 +- llvm/test/CodeGen/X86/sdiv_fix.ll | 5 +- 42 files changed, 4215 insertions(+), 4200 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 2f853a2c6f9f5..1c05afba730d5 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -117,10 +117,11 @@ class TargetInstrInfo : public MCInstrInfo { const MachineFunction &MF) const; /// Return true if the instruction is trivially rematerializable, meaning it - /// has no side effects and requires no operands that aren't always available. - /// This means the only allowed uses are constants and unallocatable physical - /// registers so that the instructions result is independent of the place - /// in the function. + /// has no side effects. Uses of constants and unallocatable physical + /// registers are always trivial to rematerialize so that the instructions + /// result is independent of the place in the function. Uses of virtual + /// registers are allowed but it is caller's responsibility to ensure these + /// operands are valid at the point the instruction is being moved. bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA = nullptr) const { return MI.getOpcode() == TargetOpcode::IMPLICIT_DEF || @@ -140,8 +141,7 @@ class TargetInstrInfo : public MCInstrInfo { /// set, this hook lets the target specify whether the instruction is actually /// trivially rematerializable, taking into consideration its operands.
This /// predicate must return false if the instruction has any side effects other - than producing a value, or if it requres any address registers that are - not always available. + than producing a value. /// Requirements must be check as stated in isTriviallyReMaterializable() . virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA) const { diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 1eab8e7443a73..fe7d60e0b7e22 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -921,7 +921,8 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric( const MachineRegisterInfo &MRI = MF.getRegInfo(); // Remat clients assume operand 0 is the defined register. - if (!MI.getNumOperands() || !MI.getOperand(0).isReg()) + if (!MI.getNumOperands() || !MI.getOperand(0).isReg() || + MI.getOperand(0).isTied()) return false; Register DefReg = MI.getOperand(0).getReg(); @@ -983,12 +984,6 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric( // same virtual register, though. if (MO.isDef() && Reg != DefReg) return false; - - // Don't allow any virtual-register uses. Rematting an instruction with - // virtual register uses would length the live ranges of the uses, which - // is not necessarily a good idea, certainly not "trivial". - if (MO.isUse()) - return false; } // Everything checked out. diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir index ed799bfca0283..c9915aaabfde6 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir @@ -51,6 +51,66 @@ body: | S_NOP 0, implicit %2 S_ENDPGM 0 ... +# The liverange of %0 covers a point of rematerialization, source value is +# available.
+--- +name: test_remat_s_mov_b32_vreg_src_long_lr +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + ; GCN-LABEL: name: test_remat_s_mov_b32_vreg_src_long_lr + ; GCN: renamable $sgpr0 = IMPLICIT_DEF + ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 + ; GCN: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 + ; GCN: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 + ; GCN: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN: S_NOP 0, implicit killed renamable $sgpr0 + ; GCN: S_ENDPGM 0 + %0:sreg_32 = IMPLICIT_DEF + %1:sreg_32 = S_MOV_B32 %0:sreg_32 + %2:sreg_32 = S_MOV_B32 %0:sreg_32 + %3:sreg_32 = S_MOV_B32 %0:sreg_32 + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_NOP 0, implicit %3 + S_NOP 0, implicit %0 + S_ENDPGM 0 +... +# The liverange of %0 does not cover a point of rematerialization, source value is +# unavailable and we do not want to artificially extend the liverange.
+--- +name: test_no_remat_s_mov_b32_vreg_src_short_lr +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: $sgpr32 +body: | + bb.0: + ; GCN-LABEL: name: test_no_remat_s_mov_b32_vreg_src_short_lr + ; GCN: renamable $sgpr0 = IMPLICIT_DEF + ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 + ; GCN: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) + ; GCN: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 + ; GCN: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + ; GCN: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 + ; GCN: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; GCN: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; GCN: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN: S_NOP 0, implicit killed renamable $sgpr0 + ; GCN: S_ENDPGM 0 + %0:sreg_32 = IMPLICIT_DEF + %1:sreg_32 = S_MOV_B32 %0:sreg_32 + %2:sreg_32 = S_MOV_B32 %0:sreg_32 + %3:sreg_32 = S_MOV_B32 %0:sreg_32 + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_NOP 0, implicit %3 + S_ENDPGM 0 +... 
--- name: test_remat_s_mov_b64 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll index a4243276c70a4..175a2069a4418 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll @@ -29,20 +29,20 @@ define fastcc i8* @wrongUseOfPostDominate(i8* readonly %s, i32 %off, i8* readnon ; ENABLE-NEXT: pophs {r11, pc} ; ENABLE-NEXT: .LBB0_3: @ %while.body.preheader ; ENABLE-NEXT: movw r12, :lower16:skip -; ENABLE-NEXT: sub r1, r1, #1 +; ENABLE-NEXT: sub r3, r1, #1 ; ENABLE-NEXT: movt r12, :upper16:skip ; ENABLE-NEXT: .LBB0_4: @ %while.body ; ENABLE-NEXT: @ =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: ldrb r3, [r0] -; ENABLE-NEXT: ldrb r3, [r12, r3] -; ENABLE-NEXT: add r0, r0, r3 -; ENABLE-NEXT: sub r3, r1, #1 -; ENABLE-NEXT: cmp r3, r1 +; ENABLE-NEXT: ldrb r1, [r0] +; ENABLE-NEXT: ldrb r1, [r12, r1] +; ENABLE-NEXT: add r0, r0, r1 +; ENABLE-NEXT: sub r1, r3, #1 +; ENABLE-NEXT: cmp r1, r3 ; ENABLE-NEXT: bhs .LBB0_6 ; ENABLE-NEXT: @ %bb.5: @ %while.body ; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLE-NEXT: cmp r0, r2 -; ENABLE-NEXT: mov r1, r3 +; ENABLE-NEXT: mov r3, r1 ; ENABLE-NEXT: blo .LBB0_4 ; ENABLE-NEXT: .LBB0_6: @ %if.end29 ; ENABLE-NEXT: pop {r11, pc} @@ -119,20 +119,20 @@ define fastcc i8* @wrongUseOfPostDominate(i8* readonly %s, i32 %off, i8* readnon ; DISABLE-NEXT: pophs {r11, pc} ; DISABLE-NEXT: .LBB0_3: @ %while.body.preheader ; DISABLE-NEXT: movw r12, :lower16:skip -; DISABLE-NEXT: sub r1, r1, #1 +; DISABLE-NEXT: sub r3, r1, #1 ; DISABLE-NEXT: movt r12, :upper16:skip ; DISABLE-NEXT: .LBB0_4: @ %while.body ; DISABLE-NEXT: @ =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: ldrb r3, [r0] -; DISABLE-NEXT: ldrb r3, [r12, r3] -; DISABLE-NEXT: add r0, r0, r3 -; DISABLE-NEXT: sub r3, r1, #1 -; DISABLE-NEXT: cmp r3, r1 +; DISABLE-NEXT: ldrb r1, [r0] +; DISABLE-NEXT: ldrb r1, [r12, r1] +; DISABLE-NEXT: add r0, 
r0, r1 +; DISABLE-NEXT: sub r1, r3, #1 +; DISABLE-NEXT: cmp r1, r3 ; DISABLE-NEXT: bhs .LBB0_6 ; DISABLE-NEXT: @ %bb.5: @ %while.body ; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; DISABLE-NEXT: cmp r0, r2 -; DISABLE-NEXT: mov r1, r3 +; DISABLE-NEXT: mov r3, r1 ; DISABLE-NEXT: blo .LBB0_4 ; DISABLE-NEXT: .LBB0_6: @ %if.end29 ; DISABLE-NEXT: pop {r11, pc} diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll index 55157875d355f..ea15fcc5c824e 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll @@ -73,13 +73,13 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; SCALAR-NEXT: push {r4, r5, r11, lr} ; SCALAR-NEXT: rsb r3, r2, #0 ; SCALAR-NEXT: and r4, r2, #63 -; SCALAR-NEXT: and lr, r3, #63 -; SCALAR-NEXT: rsb r3, lr, #32 +; SCALAR-NEXT: and r12, r3, #63 +; SCALAR-NEXT: rsb r3, r12, #32 ; SCALAR-NEXT: lsl r2, r0, r4 -; SCALAR-NEXT: lsr r12, r0, lr -; SCALAR-NEXT: orr r3, r12, r1, lsl r3 -; SCALAR-NEXT: subs r12, lr, #32 -; SCALAR-NEXT: lsrpl r3, r1, r12 +; SCALAR-NEXT: lsr lr, r0, r12 +; SCALAR-NEXT: orr r3, lr, r1, lsl r3 +; SCALAR-NEXT: subs lr, r12, #32 +; SCALAR-NEXT: lsrpl r3, r1, lr ; SCALAR-NEXT: subs r5, r4, #32 ; SCALAR-NEXT: movwpl r2, #0 ; SCALAR-NEXT: cmp r5, #0 @@ -88,8 +88,8 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; SCALAR-NEXT: lsr r3, r0, r3 ; SCALAR-NEXT: orr r3, r3, r1, lsl r4 ; SCALAR-NEXT: lslpl r3, r0, r5 -; SCALAR-NEXT: lsr r0, r1, lr -; SCALAR-NEXT: cmp r12, #0 +; SCALAR-NEXT: lsr r0, r1, r12 +; SCALAR-NEXT: cmp lr, #0 ; SCALAR-NEXT: movwpl r0, #0 ; SCALAR-NEXT: orr r1, r3, r0 ; SCALAR-NEXT: mov r0, r2 @@ -245,15 +245,15 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r11, lr} ; CHECK-NEXT: push {r4, r5, r11, lr} -; CHECK-NEXT: and lr, r2, #63 +; CHECK-NEXT: and r12, r2, #63 ; CHECK-NEXT: rsb r2, r2, #0 -; CHECK-NEXT: rsb r3, lr, #32 +; CHECK-NEXT: rsb r3, r12, #32 ; CHECK-NEXT: and r4, r2, #63 -; CHECK-NEXT: lsr 
r12, r0, lr -; CHECK-NEXT: orr r3, r12, r1, lsl r3 -; CHECK-NEXT: subs r12, lr, #32 +; CHECK-NEXT: lsr lr, r0, r12 +; CHECK-NEXT: orr r3, lr, r1, lsl r3 +; CHECK-NEXT: subs lr, r12, #32 ; CHECK-NEXT: lsl r2, r0, r4 -; CHECK-NEXT: lsrpl r3, r1, r12 +; CHECK-NEXT: lsrpl r3, r1, lr ; CHECK-NEXT: subs r5, r4, #32 ; CHECK-NEXT: movwpl r2, #0 ; CHECK-NEXT: cmp r5, #0 @@ -262,8 +262,8 @@ define i64 @rotr_i64(i64 %x, i64 %z) { ; CHECK-NEXT: lsr r3, r0, r3 ; CHECK-NEXT: orr r3, r3, r1, lsl r4 ; CHECK-NEXT: lslpl r3, r0, r5 -; CHECK-NEXT: lsr r0, r1, lr -; CHECK-NEXT: cmp r12, #0 +; CHECK-NEXT: lsr r0, r1, r12 +; CHECK-NEXT: cmp lr, #0 ; CHECK-NEXT: movwpl r0, #0 ; CHECK-NEXT: orr r1, r0, r3 ; CHECK-NEXT: mov r0, r2 diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll index 54c93b493c981..6372f9be2ca3a 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -224,31 +224,31 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: bl __aeabi_uldivmod ; CHECK-NEXT: add r0, r2, #27 -; CHECK-NEXT: lsl r6, r6, #27 -; CHECK-NEXT: and r1, r0, #63 ; CHECK-NEXT: lsl r2, r7, #27 +; CHECK-NEXT: and r12, r0, #63 +; CHECK-NEXT: lsl r6, r6, #27 ; CHECK-NEXT: orr r7, r6, r7, lsr #5 +; CHECK-NEXT: rsb r3, r12, #32 +; CHECK-NEXT: lsr r2, r2, r12 ; CHECK-NEXT: mov r6, #63 -; CHECK-NEXT: rsb r3, r1, #32 -; CHECK-NEXT: lsr r2, r2, r1 -; CHECK-NEXT: subs r12, r1, #32 -; CHECK-NEXT: bic r6, r6, r0 ; CHECK-NEXT: orr r2, r2, r7, lsl r3 +; CHECK-NEXT: subs r3, r12, #32 +; CHECK-NEXT: bic r6, r6, r0 ; CHECK-NEXT: lsl r5, r9, #1 -; CHECK-NEXT: lsrpl r2, r7, r12 +; CHECK-NEXT: lsrpl r2, r7, r3 +; CHECK-NEXT: subs r1, r6, #32 ; CHECK-NEXT: lsl r0, r5, r6 -; CHECK-NEXT: subs r4, r6, #32 -; CHECK-NEXT: lsl r3, r8, #1 +; CHECK-NEXT: lsl r4, r8, #1 ; CHECK-NEXT: movwpl r0, #0 -; CHECK-NEXT: orr r3, r3, r9, lsr #31 +; CHECK-NEXT: orr r4, r4, r9, lsr #31 ; CHECK-NEXT: orr r0, r0, r2 ; CHECK-NEXT: rsb 
r2, r6, #32 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: lsr r1, r7, r1 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: lsr r2, r5, r2 -; CHECK-NEXT: orr r2, r2, r3, lsl r6 -; CHECK-NEXT: lslpl r2, r5, r4 -; CHECK-NEXT: cmp r12, #0 +; CHECK-NEXT: orr r2, r2, r4, lsl r6 +; CHECK-NEXT: lslpl r2, r5, r1 +; CHECK-NEXT: lsr r1, r7, r12 +; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: movwpl r1, #0 ; CHECK-NEXT: orr r1, r2, r1 ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll index 2922e0ed54230..0a0bb62b0a093 100644 --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -91,17 +91,17 @@ define void @i56_or(i56* %a) { ; BE-LABEL: i56_or: ; BE: @ %bb.0: ; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r12, [r0] ; BE-NEXT: ldrh r2, [r1, #4]! ; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: strb r2, [r1, #2] -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 +; BE-NEXT: ldr r3, [r0] +; BE-NEXT: orr r2, r2, r3, lsl #24 +; BE-NEXT: orr r12, r2, #384 +; BE-NEXT: strb r12, [r1, #2] +; BE-NEXT: lsr r2, r12, #8 +; BE-NEXT: strh r2, [r1] +; BE-NEXT: bic r1, r3, #255 +; BE-NEXT: orr r1, r1, r12, lsr #24 ; BE-NEXT: str r1, [r0] ; BE-NEXT: mov pc, lr %aa = load i56, i56* %a @@ -127,13 +127,13 @@ define void @i56_and_or(i56* %a) { ; BE-NEXT: ldrb r3, [r1, #2] ; BE-NEXT: strb r2, [r1, #2] ; BE-NEXT: orr r2, r3, r12, lsl #8 -; BE-NEXT: ldr r12, [r0] -; BE-NEXT: orr r2, r2, r12, lsl #24 -; BE-NEXT: orr r2, r2, #384 -; BE-NEXT: lsr r3, r2, #8 -; BE-NEXT: strh r3, [r1] -; BE-NEXT: bic r1, r12, #255 -; BE-NEXT: orr r1, r1, r2, lsr #24 +; BE-NEXT: ldr r3, [r0] +; BE-NEXT: orr r2, r2, r3, lsl #24 +; BE-NEXT: orr r12, r2, #384 +; BE-NEXT: lsr r2, r12, #8 +; BE-NEXT: strh r2, [r1] +; 
BE-NEXT: bic r1, r3, #255 +; BE-NEXT: orr r1, r1, r12, lsr #24 ; BE-NEXT: str r1, [r0] ; BE-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll index 09a991da2e59a..46490efb6631a 100644 --- a/llvm/test/CodeGen/ARM/neon-copy.ll +++ b/llvm/test/CodeGen/ARM/neon-copy.ll @@ -1340,16 +1340,16 @@ define <4 x i16> @test_extracts_inserts_varidx_insert(<8 x i16> %x, i32 %idx) { ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, sp, #8 ; CHECK-NEXT: vmov.u16 r1, d0[1] -; CHECK-NEXT: and r0, r0, #3 +; CHECK-NEXT: and r12, r0, #3 ; CHECK-NEXT: vmov.u16 r2, d0[2] -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: vmov.u16 r12, d0[3] -; CHECK-NEXT: orr r0, r3, r0, lsl #1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov.u16 r3, d0[3] +; CHECK-NEXT: orr r0, r0, r12, lsl #1 ; CHECK-NEXT: vst1.16 {d0[0]}, [r0:16] ; CHECK-NEXT: vldr d0, [sp] ; CHECK-NEXT: vmov.16 d0[1], r1 ; CHECK-NEXT: vmov.16 d0[2], r2 -; CHECK-NEXT: vmov.16 d0[3], r12 +; CHECK-NEXT: vmov.16 d0[3], r3 ; CHECK-NEXT: add sp, sp, #8 ; CHECK-NEXT: bx lr %tmp = extractelement <8 x i16> %x, i32 0 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll index 8be7100d368bb..a125446b27c3a 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -766,79 +766,85 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: move $8, $7 -; MMR3-NEXT: sw $6, 32($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $5, 36($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $2, $6 +; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill ; MMR3-NEXT: lw $16, 76($sp) -; MMR3-NEXT: srlv $4, $7, $16 -; MMR3-NEXT: not16 $3, $16 -; MMR3-NEXT: sw $3, 24($sp) # 4-byte Folded Spill -; MMR3-NEXT: sll16 $2, $6, 1 -; MMR3-NEXT: sllv $3, $2, $3 -; MMR3-NEXT: li16 $2, 64 -; 
MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: srlv $6, $6, $16 -; MMR3-NEXT: sw $6, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: subu16 $7, $2, $16 +; MMR3-NEXT: srlv $3, $7, $16 +; MMR3-NEXT: not16 $6, $16 +; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $4, $2 +; MMR3-NEXT: sw $2, 32($sp) # 4-byte Folded Spill +; MMR3-NEXT: sll16 $2, $2, 1 +; MMR3-NEXT: sllv $2, $2, $6 +; MMR3-NEXT: li16 $6, 64 +; MMR3-NEXT: or16 $2, $3 +; MMR3-NEXT: srlv $4, $4, $16 +; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: subu16 $7, $6, $16 ; MMR3-NEXT: sllv $9, $5, $7 -; MMR3-NEXT: andi16 $2, $7, 32 -; MMR3-NEXT: sw $2, 28($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $5, $16, 32 -; MMR3-NEXT: sw $5, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $4, $9 +; MMR3-NEXT: andi16 $5, $7, 32 +; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill +; MMR3-NEXT: andi16 $6, $16, 32 +; MMR3-NEXT: sw $6, 36($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $3, $9 ; MMR3-NEXT: li16 $17, 0 -; MMR3-NEXT: movn $4, $17, $2 -; MMR3-NEXT: movn $3, $6, $5 -; MMR3-NEXT: addiu $2, $16, -64 -; MMR3-NEXT: lw $5, 36($sp) # 4-byte Folded Reload -; MMR3-NEXT: srlv $5, $5, $2 -; MMR3-NEXT: sw $5, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: sll16 $6, $17, 1 -; MMR3-NEXT: sw $6, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: not16 $5, $2 -; MMR3-NEXT: sllv $5, $6, $5 -; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $5, $4 -; MMR3-NEXT: srav $1, $17, $2 -; MMR3-NEXT: andi16 $2, $2, 32 -; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $5, $1, $2 -; MMR3-NEXT: sllv $2, $17, $7 -; MMR3-NEXT: not16 $4, $7 -; MMR3-NEXT: lw $7, 36($sp) # 4-byte Folded Reload -; MMR3-NEXT: srl16 $6, $7, 1 -; MMR3-NEXT: srlv $6, $6, $4 +; MMR3-NEXT: movn $3, $17, $5 +; MMR3-NEXT: movn $2, $4, $6 +; MMR3-NEXT: addiu $4, $16, -64 +; MMR3-NEXT: lw $17, 0($sp) # 4-byte Folded Reload +; MMR3-NEXT: srlv $4, 
$17, $4 +; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: lw $6, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $4, $6, 1 +; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: addiu $5, $16, -64 +; MMR3-NEXT: not16 $5, $5 +; MMR3-NEXT: sllv $5, $4, $5 +; MMR3-NEXT: or16 $2, $3 +; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $5, $3 +; MMR3-NEXT: addiu $3, $16, -64 +; MMR3-NEXT: srav $1, $6, $3 +; MMR3-NEXT: andi16 $3, $3, 32 +; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $5, $1, $3 +; MMR3-NEXT: sllv $3, $6, $7 +; MMR3-NEXT: sw $3, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: not16 $3, $7 +; MMR3-NEXT: srl16 $4, $17, 1 +; MMR3-NEXT: srlv $3, $4, $3 ; MMR3-NEXT: sltiu $10, $16, 64 -; MMR3-NEXT: movn $5, $3, $10 -; MMR3-NEXT: or16 $6, $2 -; MMR3-NEXT: srlv $2, $7, $16 -; MMR3-NEXT: lw $3, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: lw $4, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: sllv $3, $4, $3 +; MMR3-NEXT: movn $5, $2, $10 +; MMR3-NEXT: lw $2, 4($sp) # 4-byte Folded Reload ; MMR3-NEXT: or16 $3, $2 -; MMR3-NEXT: srav $11, $17, $16 -; MMR3-NEXT: lw $4, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $11, $4 -; MMR3-NEXT: sra $2, $17, 31 +; MMR3-NEXT: srlv $2, $17, $16 +; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $7, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: sllv $17, $7, $4 +; MMR3-NEXT: or16 $17, $2 +; MMR3-NEXT: srav $11, $6, $16 +; MMR3-NEXT: lw $2, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $17, $11, $2 +; MMR3-NEXT: sra $2, $6, 31 ; MMR3-NEXT: movz $5, $8, $16 -; MMR3-NEXT: move $8, $2 -; MMR3-NEXT: movn $8, $3, $10 -; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $6, $9, $3 -; MMR3-NEXT: li16 $3, 0 -; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $7, $3, $4 -; MMR3-NEXT: or16 $7, $6 +; MMR3-NEXT: move $4, $2 +; MMR3-NEXT: movn $4, $17, $10 +; MMR3-NEXT: lw $6, 28($sp) # 4-byte Folded Reload +; MMR3-NEXT: 
movn $3, $9, $6 +; MMR3-NEXT: lw $6, 36($sp) # 4-byte Folded Reload +; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: lw $7, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $7, $17, $6 +; MMR3-NEXT: or16 $7, $3 ; MMR3-NEXT: lw $3, 20($sp) # 4-byte Folded Reload ; MMR3-NEXT: movn $1, $2, $3 ; MMR3-NEXT: movn $1, $7, $10 ; MMR3-NEXT: lw $3, 32($sp) # 4-byte Folded Reload ; MMR3-NEXT: movz $1, $3, $16 -; MMR3-NEXT: movn $11, $2, $4 +; MMR3-NEXT: movn $11, $2, $6 ; MMR3-NEXT: movn $2, $11, $10 -; MMR3-NEXT: move $3, $8 +; MMR3-NEXT: move $3, $4 ; MMR3-NEXT: move $4, $1 ; MMR3-NEXT: lwp $16, 40($sp) ; MMR3-NEXT: addiusp 48 @@ -852,79 +858,80 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 17, -4 ; MMR6-NEXT: .cfi_offset 16, -8 -; MMR6-NEXT: move $1, $7 +; MMR6-NEXT: move $12, $7 ; MMR6-NEXT: lw $3, 44($sp) ; MMR6-NEXT: li16 $2, 64 -; MMR6-NEXT: subu16 $7, $2, $3 -; MMR6-NEXT: sllv $8, $5, $7 -; MMR6-NEXT: andi16 $2, $7, 32 -; MMR6-NEXT: selnez $9, $8, $2 -; MMR6-NEXT: sllv $10, $4, $7 -; MMR6-NEXT: not16 $7, $7 -; MMR6-NEXT: srl16 $16, $5, 1 -; MMR6-NEXT: srlv $7, $16, $7 -; MMR6-NEXT: or $7, $10, $7 -; MMR6-NEXT: seleqz $7, $7, $2 -; MMR6-NEXT: or $7, $9, $7 -; MMR6-NEXT: srlv $9, $1, $3 -; MMR6-NEXT: not16 $16, $3 -; MMR6-NEXT: sw $16, 4($sp) # 4-byte Folded Spill +; MMR6-NEXT: subu16 $16, $2, $3 +; MMR6-NEXT: sllv $1, $5, $16 +; MMR6-NEXT: andi16 $2, $16, 32 +; MMR6-NEXT: selnez $8, $1, $2 +; MMR6-NEXT: sllv $9, $4, $16 +; MMR6-NEXT: not16 $16, $16 +; MMR6-NEXT: srl16 $17, $5, 1 +; MMR6-NEXT: srlv $10, $17, $16 +; MMR6-NEXT: or $9, $9, $10 +; MMR6-NEXT: seleqz $9, $9, $2 +; MMR6-NEXT: or $8, $8, $9 +; MMR6-NEXT: srlv $9, $7, $3 +; MMR6-NEXT: not16 $7, $3 +; MMR6-NEXT: sw $7, 4($sp) # 4-byte Folded Spill ; MMR6-NEXT: sll16 $17, $6, 1 -; MMR6-NEXT: sllv $10, $17, $16 +; MMR6-NEXT: sllv $10, $17, $7 ; MMR6-NEXT: or $9, $10, $9 ; MMR6-NEXT: andi16 $17, $3, 32 ; MMR6-NEXT: seleqz $9, 
$9, $17 ; MMR6-NEXT: srlv $10, $6, $3 ; MMR6-NEXT: selnez $11, $10, $17 ; MMR6-NEXT: seleqz $10, $10, $17 -; MMR6-NEXT: or $10, $10, $7 -; MMR6-NEXT: seleqz $12, $8, $2 -; MMR6-NEXT: or $8, $11, $9 +; MMR6-NEXT: or $8, $10, $8 +; MMR6-NEXT: seleqz $1, $1, $2 +; MMR6-NEXT: or $9, $11, $9 ; MMR6-NEXT: addiu $2, $3, -64 -; MMR6-NEXT: srlv $9, $5, $2 +; MMR6-NEXT: srlv $10, $5, $2 ; MMR6-NEXT: sll16 $7, $4, 1 ; MMR6-NEXT: not16 $16, $2 ; MMR6-NEXT: sllv $11, $7, $16 ; MMR6-NEXT: sltiu $13, $3, 64 -; MMR6-NEXT: or $8, $8, $12 -; MMR6-NEXT: selnez $10, $10, $13 -; MMR6-NEXT: or $9, $11, $9 -; MMR6-NEXT: srav $11, $4, $2 +; MMR6-NEXT: or $1, $9, $1 +; MMR6-NEXT: selnez $8, $8, $13 +; MMR6-NEXT: or $9, $11, $10 +; MMR6-NEXT: srav $10, $4, $2 ; MMR6-NEXT: andi16 $2, $2, 32 -; MMR6-NEXT: seleqz $12, $11, $2 +; MMR6-NEXT: seleqz $11, $10, $2 ; MMR6-NEXT: sra $14, $4, 31 ; MMR6-NEXT: selnez $15, $14, $2 ; MMR6-NEXT: seleqz $9, $9, $2 -; MMR6-NEXT: or $12, $15, $12 -; MMR6-NEXT: seleqz $12, $12, $13 -; MMR6-NEXT: selnez $2, $11, $2 -; MMR6-NEXT: seleqz $11, $14, $13 -; MMR6-NEXT: or $10, $10, $12 -; MMR6-NEXT: selnez $10, $10, $3 -; MMR6-NEXT: selnez $8, $8, $13 +; MMR6-NEXT: or $11, $15, $11 +; MMR6-NEXT: seleqz $11, $11, $13 +; MMR6-NEXT: selnez $2, $10, $2 +; MMR6-NEXT: seleqz $10, $14, $13 +; MMR6-NEXT: or $8, $8, $11 +; MMR6-NEXT: selnez $8, $8, $3 +; MMR6-NEXT: selnez $1, $1, $13 ; MMR6-NEXT: or $2, $2, $9 ; MMR6-NEXT: srav $9, $4, $3 ; MMR6-NEXT: seleqz $4, $9, $17 -; MMR6-NEXT: selnez $12, $14, $17 -; MMR6-NEXT: or $4, $12, $4 -; MMR6-NEXT: selnez $12, $4, $13 +; MMR6-NEXT: selnez $11, $14, $17 +; MMR6-NEXT: or $4, $11, $4 +; MMR6-NEXT: selnez $11, $4, $13 ; MMR6-NEXT: seleqz $2, $2, $13 ; MMR6-NEXT: seleqz $4, $6, $3 -; MMR6-NEXT: seleqz $1, $1, $3 -; MMR6-NEXT: or $2, $8, $2 -; MMR6-NEXT: selnez $2, $2, $3 +; MMR6-NEXT: seleqz $6, $12, $3 ; MMR6-NEXT: or $1, $1, $2 -; MMR6-NEXT: or $4, $4, $10 -; MMR6-NEXT: or $2, $12, $11 -; MMR6-NEXT: srlv $3, $5, $3 -; MMR6-NEXT: 
lw $5, 4($sp) # 4-byte Folded Reload -; MMR6-NEXT: sllv $5, $7, $5 -; MMR6-NEXT: or $3, $5, $3 -; MMR6-NEXT: seleqz $3, $3, $17 -; MMR6-NEXT: selnez $5, $9, $17 -; MMR6-NEXT: or $3, $5, $3 -; MMR6-NEXT: selnez $3, $3, $13 -; MMR6-NEXT: or $3, $3, $11 +; MMR6-NEXT: selnez $1, $1, $3 +; MMR6-NEXT: or $1, $6, $1 +; MMR6-NEXT: or $4, $4, $8 +; MMR6-NEXT: or $6, $11, $10 +; MMR6-NEXT: srlv $2, $5, $3 +; MMR6-NEXT: lw $3, 4($sp) # 4-byte Folded Reload +; MMR6-NEXT: sllv $3, $7, $3 +; MMR6-NEXT: or $2, $3, $2 +; MMR6-NEXT: seleqz $2, $2, $17 +; MMR6-NEXT: selnez $3, $9, $17 +; MMR6-NEXT: or $2, $3, $2 +; MMR6-NEXT: selnez $2, $2, $13 +; MMR6-NEXT: or $3, $2, $10 +; MMR6-NEXT: move $2, $6 ; MMR6-NEXT: move $5, $1 ; MMR6-NEXT: lw $16, 8($sp) # 4-byte Folded Reload ; MMR6-NEXT: lw $17, 12($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll index ed2bfc9fcf600..e4b4b3ae1d0f1 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll @@ -776,76 +776,77 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: move $8, $7 -; MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill +; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill ; MMR3-NEXT: sw $4, 28($sp) # 4-byte Folded Spill ; MMR3-NEXT: lw $16, 68($sp) ; MMR3-NEXT: li16 $2, 64 -; MMR3-NEXT: subu16 $7, $2, $16 -; MMR3-NEXT: sllv $9, $5, $7 -; MMR3-NEXT: move $17, $5 -; MMR3-NEXT: sw $5, 0($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $3, $7, 32 +; MMR3-NEXT: subu16 $17, $2, $16 +; MMR3-NEXT: sllv $9, $5, $17 +; MMR3-NEXT: andi16 $3, $17, 32 ; MMR3-NEXT: sw $3, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: li16 $2, 0 ; MMR3-NEXT: move $4, $9 ; MMR3-NEXT: movn $4, $2, $3 -; MMR3-NEXT: srlv $5, $8, $16 +; MMR3-NEXT: srlv $5, $7, $16 ; MMR3-NEXT: not16 $3, $16 ; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill ; MMR3-NEXT: sll16 $2, $6, 1 +; 
MMR3-NEXT: sw $6, 24($sp) # 4-byte Folded Spill ; MMR3-NEXT: sllv $2, $2, $3 ; MMR3-NEXT: or16 $2, $5 -; MMR3-NEXT: srlv $5, $6, $16 -; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: srlv $7, $6, $16 ; MMR3-NEXT: andi16 $3, $16, 32 ; MMR3-NEXT: sw $3, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $2, $5, $3 +; MMR3-NEXT: movn $2, $7, $3 ; MMR3-NEXT: addiu $3, $16, -64 ; MMR3-NEXT: or16 $2, $4 -; MMR3-NEXT: srlv $4, $17, $3 -; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: lw $4, 28($sp) # 4-byte Folded Reload -; MMR3-NEXT: sll16 $6, $4, 1 -; MMR3-NEXT: not16 $5, $3 -; MMR3-NEXT: sllv $5, $6, $5 -; MMR3-NEXT: lw $17, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $5, $17 -; MMR3-NEXT: srlv $1, $4, $3 -; MMR3-NEXT: andi16 $3, $3, 32 +; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: srlv $3, $6, $3 ; MMR3-NEXT: sw $3, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $5, $1, $3 +; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload +; MMR3-NEXT: sll16 $4, $3, 1 +; MMR3-NEXT: sw $4, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: addiu $5, $16, -64 +; MMR3-NEXT: not16 $5, $5 +; MMR3-NEXT: sllv $5, $4, $5 +; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $5, $4 +; MMR3-NEXT: addiu $4, $16, -64 +; MMR3-NEXT: srlv $1, $3, $4 +; MMR3-NEXT: andi16 $4, $4, 32 +; MMR3-NEXT: sw $4, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $5, $1, $4 ; MMR3-NEXT: sltiu $10, $16, 64 ; MMR3-NEXT: movn $5, $2, $10 -; MMR3-NEXT: sllv $2, $4, $7 -; MMR3-NEXT: not16 $3, $7 -; MMR3-NEXT: lw $7, 0($sp) # 4-byte Folded Reload -; MMR3-NEXT: srl16 $4, $7, 1 +; MMR3-NEXT: sllv $2, $3, $17 +; MMR3-NEXT: not16 $3, $17 +; MMR3-NEXT: srl16 $4, $6, 1 ; MMR3-NEXT: srlv $4, $4, $3 ; MMR3-NEXT: or16 $4, $2 -; MMR3-NEXT: srlv $2, $7, $16 +; MMR3-NEXT: srlv $2, $6, $16 ; MMR3-NEXT: lw $3, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: lw $6, 0($sp) # 4-byte Folded Reload ; MMR3-NEXT: sllv $3, $6, $3 ; MMR3-NEXT: or16 $3, $2 ; MMR3-NEXT: lw $2, 28($sp) # 
4-byte Folded Reload ; MMR3-NEXT: srlv $2, $2, $16 -; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $2, $17 +; MMR3-NEXT: lw $6, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $3, $2, $6 ; MMR3-NEXT: movz $5, $8, $16 -; MMR3-NEXT: li16 $6, 0 -; MMR3-NEXT: movz $3, $6, $10 -; MMR3-NEXT: lw $7, 20($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $4, $9, $7 -; MMR3-NEXT: lw $6, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $6, $7, $17 -; MMR3-NEXT: or16 $6, $4 +; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: movz $3, $17, $10 +; MMR3-NEXT: lw $17, 20($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $9, $17 +; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: movn $7, $17, $6 +; MMR3-NEXT: or16 $7, $4 ; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $1, $7, $4 -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $1, $6, $10 +; MMR3-NEXT: movn $1, $17, $4 +; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: movn $1, $7, $10 ; MMR3-NEXT: lw $4, 24($sp) # 4-byte Folded Reload ; MMR3-NEXT: movz $1, $4, $16 -; MMR3-NEXT: movn $2, $7, $17 +; MMR3-NEXT: movn $2, $17, $6 ; MMR3-NEXT: li16 $4, 0 ; MMR3-NEXT: movz $2, $4, $10 ; MMR3-NEXT: move $4, $1 @@ -855,98 +856,91 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; ; MMR6-LABEL: lshr_i128: ; MMR6: # %bb.0: # %entry -; MMR6-NEXT: addiu $sp, $sp, -32 -; MMR6-NEXT: .cfi_def_cfa_offset 32 -; MMR6-NEXT: sw $17, 28($sp) # 4-byte Folded Spill -; MMR6-NEXT: sw $16, 24($sp) # 4-byte Folded Spill +; MMR6-NEXT: addiu $sp, $sp, -24 +; MMR6-NEXT: .cfi_def_cfa_offset 24 +; MMR6-NEXT: sw $17, 20($sp) # 4-byte Folded Spill +; MMR6-NEXT: sw $16, 16($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 17, -4 ; MMR6-NEXT: .cfi_offset 16, -8 ; MMR6-NEXT: move $1, $7 -; MMR6-NEXT: move $7, $5 -; MMR6-NEXT: lw $3, 60($sp) +; MMR6-NEXT: move $7, $4 +; MMR6-NEXT: lw $3, 52($sp) ; MMR6-NEXT: srlv $2, $1, $3 -; MMR6-NEXT: not16 $5, $3 -; MMR6-NEXT: sw $5, 12($sp) # 4-byte Folded Spill -; 
MMR6-NEXT: move $17, $6 -; MMR6-NEXT: sw $6, 16($sp) # 4-byte Folded Spill +; MMR6-NEXT: not16 $16, $3 +; MMR6-NEXT: sw $16, 8($sp) # 4-byte Folded Spill +; MMR6-NEXT: move $4, $6 +; MMR6-NEXT: sw $6, 12($sp) # 4-byte Folded Spill ; MMR6-NEXT: sll16 $6, $6, 1 -; MMR6-NEXT: sllv $6, $6, $5 +; MMR6-NEXT: sllv $6, $6, $16 ; MMR6-NEXT: or $8, $6, $2 -; MMR6-NEXT: addiu $5, $3, -64 -; MMR6-NEXT: srlv $9, $7, $5 -; MMR6-NEXT: move $6, $4 -; MMR6-NEXT: sll16 $2, $4, 1 -; MMR6-NEXT: sw $2, 8($sp) # 4-byte Folded Spill -; MMR6-NEXT: not16 $16, $5 +; MMR6-NEXT: addiu $6, $3, -64 +; MMR6-NEXT: srlv $9, $5, $6 +; MMR6-NEXT: sll16 $2, $7, 1 +; MMR6-NEXT: sw $2, 4($sp) # 4-byte Folded Spill +; MMR6-NEXT: not16 $16, $6 ; MMR6-NEXT: sllv $10, $2, $16 ; MMR6-NEXT: andi16 $16, $3, 32 ; MMR6-NEXT: seleqz $8, $8, $16 ; MMR6-NEXT: or $9, $10, $9 -; MMR6-NEXT: srlv $10, $17, $3 +; MMR6-NEXT: srlv $10, $4, $3 ; MMR6-NEXT: selnez $11, $10, $16 ; MMR6-NEXT: li16 $17, 64 ; MMR6-NEXT: subu16 $2, $17, $3 -; MMR6-NEXT: sllv $12, $7, $2 -; MMR6-NEXT: move $17, $7 +; MMR6-NEXT: sllv $12, $5, $2 ; MMR6-NEXT: andi16 $4, $2, 32 -; MMR6-NEXT: andi16 $7, $5, 32 -; MMR6-NEXT: sw $7, 20($sp) # 4-byte Folded Spill -; MMR6-NEXT: seleqz $9, $9, $7 +; MMR6-NEXT: andi16 $17, $6, 32 +; MMR6-NEXT: seleqz $9, $9, $17 ; MMR6-NEXT: seleqz $13, $12, $4 ; MMR6-NEXT: or $8, $11, $8 ; MMR6-NEXT: selnez $11, $12, $4 -; MMR6-NEXT: sllv $12, $6, $2 -; MMR6-NEXT: move $7, $6 -; MMR6-NEXT: sw $6, 4($sp) # 4-byte Folded Spill +; MMR6-NEXT: sllv $12, $7, $2 ; MMR6-NEXT: not16 $2, $2 -; MMR6-NEXT: srl16 $6, $17, 1 +; MMR6-NEXT: srl16 $6, $5, 1 ; MMR6-NEXT: srlv $2, $6, $2 ; MMR6-NEXT: or $2, $12, $2 ; MMR6-NEXT: seleqz $2, $2, $4 -; MMR6-NEXT: srlv $4, $7, $5 -; MMR6-NEXT: or $11, $11, $2 -; MMR6-NEXT: or $5, $8, $13 -; MMR6-NEXT: srlv $6, $17, $3 -; MMR6-NEXT: lw $2, 20($sp) # 4-byte Folded Reload -; MMR6-NEXT: selnez $7, $4, $2 -; MMR6-NEXT: sltiu $8, $3, 64 -; MMR6-NEXT: selnez $12, $5, $8 -; MMR6-NEXT: or $7, $7, $9 -; 
MMR6-NEXT: lw $5, 12($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $4, $3, -64 +; MMR6-NEXT: srlv $4, $7, $4 +; MMR6-NEXT: or $12, $11, $2 +; MMR6-NEXT: or $6, $8, $13 +; MMR6-NEXT: srlv $5, $5, $3 +; MMR6-NEXT: selnez $8, $4, $17 +; MMR6-NEXT: sltiu $11, $3, 64 +; MMR6-NEXT: selnez $13, $6, $11 +; MMR6-NEXT: or $8, $8, $9 ; MMR6-NEXT: lw $2, 8($sp) # 4-byte Folded Reload -; MMR6-NEXT: sllv $9, $2, $5 +; MMR6-NEXT: lw $6, 4($sp) # 4-byte Folded Reload +; MMR6-NEXT: sllv $9, $6, $2 ; MMR6-NEXT: seleqz $10, $10, $16 -; MMR6-NEXT: li16 $5, 0 -; MMR6-NEXT: or $10, $10, $11 -; MMR6-NEXT: or $6, $9, $6 -; MMR6-NEXT: seleqz $2, $7, $8 -; MMR6-NEXT: seleqz $7, $5, $8 -; MMR6-NEXT: lw $5, 4($sp) # 4-byte Folded Reload -; MMR6-NEXT: srlv $9, $5, $3 -; MMR6-NEXT: seleqz $11, $9, $16 -; MMR6-NEXT: selnez $11, $11, $8 +; MMR6-NEXT: li16 $2, 0 +; MMR6-NEXT: or $10, $10, $12 +; MMR6-NEXT: or $9, $9, $5 +; MMR6-NEXT: seleqz $5, $8, $11 +; MMR6-NEXT: seleqz $8, $2, $11 +; MMR6-NEXT: srlv $7, $7, $3 +; MMR6-NEXT: seleqz $2, $7, $16 +; MMR6-NEXT: selnez $2, $2, $11 ; MMR6-NEXT: seleqz $1, $1, $3 -; MMR6-NEXT: or $2, $12, $2 -; MMR6-NEXT: selnez $2, $2, $3 -; MMR6-NEXT: or $5, $1, $2 -; MMR6-NEXT: or $2, $7, $11 -; MMR6-NEXT: seleqz $1, $6, $16 -; MMR6-NEXT: selnez $6, $9, $16 -; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload -; MMR6-NEXT: seleqz $9, $16, $3 -; MMR6-NEXT: selnez $10, $10, $8 -; MMR6-NEXT: lw $16, 20($sp) # 4-byte Folded Reload -; MMR6-NEXT: seleqz $4, $4, $16 -; MMR6-NEXT: seleqz $4, $4, $8 -; MMR6-NEXT: or $4, $10, $4 +; MMR6-NEXT: or $5, $13, $5 +; MMR6-NEXT: selnez $5, $5, $3 +; MMR6-NEXT: or $5, $1, $5 +; MMR6-NEXT: or $2, $8, $2 +; MMR6-NEXT: seleqz $1, $9, $16 +; MMR6-NEXT: selnez $6, $7, $16 +; MMR6-NEXT: lw $7, 12($sp) # 4-byte Folded Reload +; MMR6-NEXT: seleqz $7, $7, $3 +; MMR6-NEXT: selnez $9, $10, $11 +; MMR6-NEXT: seleqz $4, $4, $17 +; MMR6-NEXT: seleqz $4, $4, $11 +; MMR6-NEXT: or $4, $9, $4 ; MMR6-NEXT: selnez $3, $4, $3 -; MMR6-NEXT: or $4, $9, 
$3 +; MMR6-NEXT: or $4, $7, $3 ; MMR6-NEXT: or $1, $6, $1 -; MMR6-NEXT: selnez $1, $1, $8 -; MMR6-NEXT: or $3, $7, $1 -; MMR6-NEXT: lw $16, 24($sp) # 4-byte Folded Reload -; MMR6-NEXT: lw $17, 28($sp) # 4-byte Folded Reload -; MMR6-NEXT: addiu $sp, $sp, 32 +; MMR6-NEXT: selnez $1, $1, $11 +; MMR6-NEXT: or $3, $8, $1 +; MMR6-NEXT: lw $16, 16($sp) # 4-byte Folded Reload +; MMR6-NEXT: lw $17, 20($sp) # 4-byte Folded Reload +; MMR6-NEXT: addiu $sp, $sp, 24 ; MMR6-NEXT: jrc $ra entry: diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll index a8d829bef1d49..5050cf40332ba 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll @@ -849,77 +849,78 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: swp $16, 32($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 -; MMR3-NEXT: move $17, $7 -; MMR3-NEXT: sw $7, 4($sp) # 4-byte Folded Spill -; MMR3-NEXT: move $7, $6 +; MMR3-NEXT: sw $7, 8($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $17, $6 +; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill ; MMR3-NEXT: move $1, $4 ; MMR3-NEXT: lw $16, 68($sp) ; MMR3-NEXT: li16 $2, 64 ; MMR3-NEXT: subu16 $6, $2, $16 -; MMR3-NEXT: srlv $9, $7, $6 -; MMR3-NEXT: andi16 $4, $6, 32 -; MMR3-NEXT: sw $4, 24($sp) # 4-byte Folded Spill +; MMR3-NEXT: srlv $9, $17, $6 +; MMR3-NEXT: andi16 $7, $6, 32 +; MMR3-NEXT: sw $7, 24($sp) # 4-byte Folded Spill ; MMR3-NEXT: li16 $3, 0 -; MMR3-NEXT: move $2, $9 -; MMR3-NEXT: movn $2, $3, $4 -; MMR3-NEXT: sllv $3, $1, $16 -; MMR3-NEXT: sw $3, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: not16 $4, $16 -; MMR3-NEXT: sw $4, 20($sp) # 4-byte Folded Spill -; MMR3-NEXT: sw $5, 28($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $4, $9 +; MMR3-NEXT: movn $4, $3, $7 +; MMR3-NEXT: sllv $7, $1, $16 +; MMR3-NEXT: not16 $2, $16 +; MMR3-NEXT: sw $2, 20($sp) # 4-byte Folded Spill ; MMR3-NEXT: srl16 $3, $5, 1 -; MMR3-NEXT: srlv $3, $3, $4 -; MMR3-NEXT: lw $4, 
16($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $3, $4 +; MMR3-NEXT: srlv $3, $3, $2 +; MMR3-NEXT: or16 $3, $7 ; MMR3-NEXT: sllv $5, $5, $16 -; MMR3-NEXT: sw $5, 8($sp) # 4-byte Folded Spill -; MMR3-NEXT: andi16 $4, $16, 32 -; MMR3-NEXT: sw $4, 16($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $3, $5, $4 -; MMR3-NEXT: addiu $4, $16, -64 -; MMR3-NEXT: or16 $3, $2 -; MMR3-NEXT: sllv $2, $7, $4 +; MMR3-NEXT: sw $5, 4($sp) # 4-byte Folded Spill +; MMR3-NEXT: andi16 $2, $16, 32 +; MMR3-NEXT: sw $2, 16($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $3, $5, $2 +; MMR3-NEXT: addiu $7, $16, -64 +; MMR3-NEXT: or16 $3, $4 +; MMR3-NEXT: sllv $2, $17, $7 ; MMR3-NEXT: sw $2, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: srl16 $5, $17, 1 -; MMR3-NEXT: not16 $2, $4 +; MMR3-NEXT: lw $4, 8($sp) # 4-byte Folded Reload +; MMR3-NEXT: srl16 $5, $4, 1 +; MMR3-NEXT: not16 $2, $7 ; MMR3-NEXT: srlv $2, $5, $2 -; MMR3-NEXT: lw $17, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: or16 $2, $17 -; MMR3-NEXT: lw $17, 4($sp) # 4-byte Folded Reload -; MMR3-NEXT: sllv $8, $17, $4 -; MMR3-NEXT: andi16 $4, $4, 32 -; MMR3-NEXT: sw $4, 12($sp) # 4-byte Folded Spill -; MMR3-NEXT: movn $2, $8, $4 +; MMR3-NEXT: lw $7, 12($sp) # 4-byte Folded Reload +; MMR3-NEXT: or16 $2, $7 +; MMR3-NEXT: addiu $7, $16, -64 +; MMR3-NEXT: sllv $8, $4, $7 +; MMR3-NEXT: andi16 $7, $7, 32 +; MMR3-NEXT: sw $7, 12($sp) # 4-byte Folded Spill +; MMR3-NEXT: movn $2, $8, $7 ; MMR3-NEXT: sltiu $10, $16, 64 ; MMR3-NEXT: movn $2, $3, $10 -; MMR3-NEXT: srlv $4, $17, $6 +; MMR3-NEXT: srlv $3, $4, $6 +; MMR3-NEXT: sw $3, 0($sp) # 4-byte Folded Spill +; MMR3-NEXT: move $7, $4 ; MMR3-NEXT: not16 $3, $6 -; MMR3-NEXT: sll16 $6, $7, 1 -; MMR3-NEXT: sllv $3, $6, $3 +; MMR3-NEXT: sll16 $4, $17, 1 +; MMR3-NEXT: sllv $3, $4, $3 +; MMR3-NEXT: lw $4, 0($sp) # 4-byte Folded Reload ; MMR3-NEXT: or16 $3, $4 -; MMR3-NEXT: sllv $6, $7, $16 +; MMR3-NEXT: sllv $6, $17, $16 ; MMR3-NEXT: lw $4, 20($sp) # 4-byte Folded Reload ; MMR3-NEXT: srlv $4, $5, $4 ; 
MMR3-NEXT: or16 $4, $6 -; MMR3-NEXT: sllv $6, $17, $16 -; MMR3-NEXT: lw $17, 16($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $4, $6, $17 +; MMR3-NEXT: sllv $6, $7, $16 +; MMR3-NEXT: lw $7, 16($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $4, $6, $7 ; MMR3-NEXT: movz $2, $1, $16 ; MMR3-NEXT: li16 $5, 0 ; MMR3-NEXT: movz $4, $5, $10 -; MMR3-NEXT: lw $7, 24($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $3, $9, $7 -; MMR3-NEXT: lw $5, 8($sp) # 4-byte Folded Reload -; MMR3-NEXT: li16 $7, 0 -; MMR3-NEXT: movn $5, $7, $17 +; MMR3-NEXT: lw $17, 24($sp) # 4-byte Folded Reload +; MMR3-NEXT: movn $3, $9, $17 +; MMR3-NEXT: lw $5, 4($sp) # 4-byte Folded Reload +; MMR3-NEXT: li16 $17, 0 +; MMR3-NEXT: movn $5, $17, $7 ; MMR3-NEXT: or16 $5, $3 ; MMR3-NEXT: lw $3, 12($sp) # 4-byte Folded Reload -; MMR3-NEXT: movn $8, $7, $3 -; MMR3-NEXT: li16 $7, 0 +; MMR3-NEXT: movn $8, $17, $3 +; MMR3-NEXT: li16 $17, 0 ; MMR3-NEXT: movn $8, $5, $10 ; MMR3-NEXT: lw $3, 28($sp) # 4-byte Folded Reload ; MMR3-NEXT: movz $8, $3, $16 -; MMR3-NEXT: movn $6, $7, $17 +; MMR3-NEXT: movn $6, $17, $7 ; MMR3-NEXT: li16 $3, 0 ; MMR3-NEXT: movz $6, $3, $10 ; MMR3-NEXT: move $3, $8 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sub.ll b/llvm/test/CodeGen/Mips/llvm-ir/sub.ll index 51dcccefc84d3..bc9ce4420332e 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/sub.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/sub.ll @@ -162,35 +162,32 @@ entry: ; MMR3: lw $[[T20:[0-9]+]], 0($sp) ; MMR3: subu16 $5, $[[T19]], $[[T20]] -; MMR6: move $[[T0:[0-9]+]], $7 -; MMR6: sw $7, 8($sp) -; MMR6: move $[[T1:[0-9]+]], $5 -; MMR6: sw $4, 12($sp) +; MMR6: sw $7, 4($sp) +; MMR6: sw $4, 8($sp) ; MMR6: lw $[[T2:[0-9]+]], 48($sp) ; MMR6: sltu $[[T3:[0-9]+]], $6, $[[T2]] ; MMR6: xor $[[T4:[0-9]+]], $6, $[[T2]] ; MMR6: sltiu $[[T5:[0-9]+]], $[[T4]], 1 ; MMR6: seleqz $[[T6:[0-9]+]], $[[T3]], $[[T5]] ; MMR6: lw $[[T7:[0-9]+]], 52($sp) -; MMR6: sltu $[[T8:[0-9]+]], $[[T0]], $[[T7]] +; MMR6: sltu $[[T8:[0-9]+]], $7, $[[T7]] ; MMR6: selnez $[[T9:[0-9]+]], 
$[[T8]], $[[T5]] ; MMR6: or $[[T10:[0-9]+]], $[[T9]], $[[T6]] ; MMR6: lw $[[T11:[0-9]+]], 44($sp) -; MMR6: subu16 $[[T12:[0-9]+]], $[[T1]], $[[T11]] -; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T7]] -; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T7]] -; MMR6: sltu $[[T17:[0-9]+]], $[[T1]], $[[T11]] -; MMR6: lw $[[T18:[0-9]+]], 40($sp) -; MMR6: lw $[[T19:[0-9]+]], 12($sp) -; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $[[T18]] +; MMR6: subu16 $[[T12:[0-9]+]], $5, $[[T11]] +; MMR6: lw $[[T1:[0-9]+]], 12($sp) +; MMR6: subu16 $[[T13:[0-9]+]], $[[T12]], $[[T1]] +; MMR6: sltu $[[T16:[0-9]+]], $[[T12]], $[[T1]] +; MMR6: sltu $[[T17:[0-9]+]], $5, $[[T11]] +; MMR6: lw $[[T19:[0-9]+]], 8($sp) +; MMR6: subu16 $[[T20:[0-9]+]], $[[T19]], $5 ; MMR6: subu16 $[[T21:[0-9]+]], $[[T20]], $[[T17]] ; MMR6: subu16 $[[T22:[0-9]+]], $[[T21]], $[[T16]] ; MMR6: subu16 $[[T23:[0-9]+]], $6, $[[T2]] -; MMR6: subu16 $4, $[[T23]], $5 -; MMR6: lw $[[T24:[0-9]+]], 8($sp) -; MMR6: lw $[[T25:[0-9]+]], 0($sp) -; MMR6: subu16 $5, $[[T24]], $[[T25]] -; MMR6: lw $3, 4($sp) +; MMR6: subu16 $4, $[[T23]], $[[T8]] +; MMR6: lw $[[T24:[0-9]+]], 4($sp) +; MMR6: subu16 $5, $[[T24]], $[[T7]] +; MMR6: lw $3, 0($sp) ; FIXME: The sltu, dsll, dsrl pattern here occurs when an i32 is zero ; extended to 64 bits. Fortunately slt(i)(u) actually gives an i1. 
diff --git a/llvm/test/CodeGen/Mips/tls.ll b/llvm/test/CodeGen/Mips/tls.ll index 4ef885e8fb06a..39bd85603d27b 100644 --- a/llvm/test/CodeGen/Mips/tls.ll +++ b/llvm/test/CodeGen/Mips/tls.ll @@ -71,8 +71,8 @@ define dso_preemptable i32 @f3() nounwind { entry: ; PIC32-LABEL: f3: ; PIC32: addu $[[R0:[a-z0-9]+]], $2, $25 -; PIC32: addiu $4, $[[R0]], %tlsldm(f3.i) ; PIC32: lw $25, %call16(__tls_get_addr)($[[R0]]) +; PIC32: addiu $4, $[[R0]], %tlsldm(f3.i) ; PIC32: jalr $25 ; PIC32: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i) ; PIC32: addu $[[R1:[0-9]+]], $[[R0]], $2 @@ -84,8 +84,8 @@ entry: ; PIC64: lui $[[R0:[a-z0-9]+]], %hi(%neg(%gp_rel(f3))) ; PIC64: daddu $[[R0]], $[[R0]], $25 ; PIC64: daddiu $[[R1:[a-z0-9]+]], $[[R0]], %lo(%neg(%gp_rel(f3))) -; PIC64: daddiu $4, $[[R1]], %tlsldm(f3.i) ; PIC64: ld $25, %call16(__tls_get_addr)($[[R1]]) +; PIC64: daddiu $4, $[[R1]], %tlsldm(f3.i) ; PIC64: jalr $25 ; PIC64: lui $[[R0:[0-9]+]], %dtprel_hi(f3.i) ; PIC64: daddu $[[R1:[0-9]+]], $[[R0]], $2 diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll index c623844646a06..5c30d19c92618 100644 --- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll +++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll @@ -8388,17 +8388,17 @@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB100_2 ; RV32I-NEXT: .LBB100_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt @@ -8406,9 +8406,9 
@@ define i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: bnez a0, .LBB100_4 ; RV32I-NEXT: .LBB100_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s1, a0, .LBB100_1 +; RV32I-NEXT: bltu s0, a0, .LBB100_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB100_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8530,11 +8530,11 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB101_2 ; RV32I-NEXT: .LBB101_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1 @@ -8542,15 +8542,15 @@ define i16 @atomicrmw_umax_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB101_4 ; RV32I-NEXT: .LBB101_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s1, a0, .LBB101_1 +; RV32I-NEXT: bltu s0, a0, .LBB101_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB101_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8672,27 +8672,27 @@ define i16 @atomicrmw_umax_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: 
mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB102_2 ; RV32I-NEXT: .LBB102_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB102_4 ; RV32I-NEXT: .LBB102_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s1, a0, .LBB102_1 +; RV32I-NEXT: bltu s0, a0, .LBB102_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB102_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8814,11 +8814,11 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB103_2 ; RV32I-NEXT: .LBB103_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1 @@ -8826,15 +8826,15 @@ define i16 @atomicrmw_umax_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB103_4 ; RV32I-NEXT: .LBB103_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, 
a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s1, a0, .LBB103_1 +; RV32I-NEXT: bltu s0, a0, .LBB103_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB103_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -8956,11 +8956,11 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB104_2 ; RV32I-NEXT: .LBB104_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1 @@ -8968,15 +8968,15 @@ define i16 @atomicrmw_umax_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB104_4 ; RV32I-NEXT: .LBB104_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s1, a0, .LBB104_1 +; RV32I-NEXT: bltu s0, a0, .LBB104_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB104_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9098,17 +9098,17 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB105_2 ; RV32I-NEXT: .LBB105_1: # 
%atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB105_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt @@ -9116,9 +9116,9 @@ define i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: bnez a0, .LBB105_4 ; RV32I-NEXT: .LBB105_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s1, a0, .LBB105_1 +; RV32I-NEXT: bgeu s0, a0, .LBB105_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB105_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9240,11 +9240,11 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB106_2 ; RV32I-NEXT: .LBB106_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB106_2 Depth=1 @@ -9252,15 +9252,15 @@ define i16 @atomicrmw_umin_i16_acquire(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 2 ; RV32I-NEXT: addi a4, zero, 2 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB106_4 ; RV32I-NEXT: .LBB106_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s1, a0, .LBB106_1 +; RV32I-NEXT: bgeu s0, a0, .LBB106_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: 
Header=BB106_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9382,27 +9382,27 @@ define i16 @atomicrmw_umin_i16_release(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB107_2 ; RV32I-NEXT: .LBB107_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 3 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB107_4 ; RV32I-NEXT: .LBB107_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s1, a0, .LBB107_1 +; RV32I-NEXT: bgeu s0, a0, .LBB107_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB107_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9524,11 +9524,11 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB108_2 ; RV32I-NEXT: .LBB108_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1 @@ -9536,15 +9536,15 @@ define i16 @atomicrmw_umin_i16_acq_rel(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 4 ; RV32I-NEXT: addi a4, zero, 2 -; 
RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB108_4 ; RV32I-NEXT: .LBB108_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s1, a0, .LBB108_1 +; RV32I-NEXT: bgeu s0, a0, .LBB108_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB108_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -9666,11 +9666,11 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB109_2 ; RV32I-NEXT: .LBB109_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1 @@ -9678,15 +9678,15 @@ define i16 @atomicrmw_umin_i16_seq_cst(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: addi a1, sp, 10 ; RV32I-NEXT: addi a3, zero, 5 ; RV32I-NEXT: addi a4, zero, 5 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: call __atomic_compare_exchange_2@plt ; RV32I-NEXT: lh a1, 10(sp) ; RV32I-NEXT: bnez a0, .LBB109_4 ; RV32I-NEXT: .LBB109_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s1, a0, .LBB109_1 +; RV32I-NEXT: bgeu s0, a0, .LBB109_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB109_2 Depth=1 ; RV32I-NEXT: mv a2, s2 diff --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll index dee847c7cdaee..fcbb7bb3c435e 100644 --- a/llvm/test/CodeGen/RISCV/atomic-signext.ll +++ 
b/llvm/test/CodeGen/RISCV/atomic-signext.ll @@ -1952,17 +1952,17 @@ define signext i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB23_2 ; RV32I-NEXT: .LBB23_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB23_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt @@ -1970,9 +1970,9 @@ define signext i16 @atomicrmw_umax_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: bnez a0, .LBB23_4 ; RV32I-NEXT: .LBB23_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bltu s1, a0, .LBB23_1 +; RV32I-NEXT: bltu s0, a0, .LBB23_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB23_2 Depth=1 ; RV32I-NEXT: mv a2, s2 @@ -2100,17 +2100,17 @@ define signext i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s2, a1 -; RV32I-NEXT: mv s3, a0 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: lhu a1, 0(a0) ; RV32I-NEXT: lui a0, 16 -; RV32I-NEXT: addi s0, a0, -1 -; RV32I-NEXT: and s1, s2, s0 +; RV32I-NEXT: addi s3, a0, -1 +; RV32I-NEXT: and s0, s2, s3 ; RV32I-NEXT: j .LBB24_2 ; RV32I-NEXT: .LBB24_1: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB24_2 Depth=1 ; RV32I-NEXT: sh a1, 10(sp) ; RV32I-NEXT: addi a1, sp, 10 -; RV32I-NEXT: mv a0, s3 +; RV32I-NEXT: mv a0, s1 ; 
RV32I-NEXT: mv a3, zero ; RV32I-NEXT: mv a4, zero ; RV32I-NEXT: call __atomic_compare_exchange_2@plt @@ -2118,9 +2118,9 @@ define signext i16 @atomicrmw_umin_i16_monotonic(i16 *%a, i16 %b) nounwind { ; RV32I-NEXT: bnez a0, .LBB24_4 ; RV32I-NEXT: .LBB24_2: # %atomicrmw.start ; RV32I-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32I-NEXT: and a0, a1, s0 +; RV32I-NEXT: and a0, a1, s3 ; RV32I-NEXT: mv a2, a1 -; RV32I-NEXT: bgeu s1, a0, .LBB24_1 +; RV32I-NEXT: bgeu s0, a0, .LBB24_1 ; RV32I-NEXT: # %bb.3: # %atomicrmw.start ; RV32I-NEXT: # in Loop: Header=BB24_2 Depth=1 ; RV32I-NEXT: mv a2, s2 diff --git a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll index 1921e236837ad..588244003411b 100644 --- a/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/bswap-ctlz-cttz-ctpop.ll @@ -577,21 +577,21 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s3, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: not a1, s4 +; RV32I-NEXT: not a1, s0 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s5, a2, 1365 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: addi s4, a2, 1365 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s0, a1, 819 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: addi s5, a1, 819 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 @@ -599,26 +599,26 @@ define i64 @test_cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: addi s6, a1, -241 ; RV32I-NEXT: and a0, a0, s6 ; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: 
addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: addi s3, a1, 257 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: addi a0, s3, -1 -; RV32I-NEXT: not a1, s3 +; RV32I-NEXT: addi a0, s1, -1 +; RV32I-NEXT: not a1, s1 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt -; RV32I-NEXT: bnez s4, .LBB7_2 +; RV32I-NEXT: bnez s0, .LBB7_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: addi a0, a0, 32 @@ -976,21 +976,21 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s3, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: not a1, s4 +; RV32I-NEXT: not a1, s0 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s5, a2, 1365 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: addi s4, a2, 1365 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s0, a1, 819 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: addi s5, a1, 819 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 @@ -998,26 +998,26 @@ define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: addi s6, a1, -241 ; 
RV32I-NEXT: and a0, a0, s6 ; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: addi s3, a1, 257 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: addi a0, s3, -1 -; RV32I-NEXT: not a1, s3 +; RV32I-NEXT: addi a0, s1, -1 +; RV32I-NEXT: not a1, s1 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt -; RV32I-NEXT: bnez s4, .LBB11_2 +; RV32I-NEXT: bnez s0, .LBB11_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: addi a0, a0, 32 @@ -1182,17 +1182,17 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: srli a0, a1, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s3, a2, 1365 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s2, a2, 1365 +; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: sub a0, a1, a0 ; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s0, a1, 819 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: addi s1, a1, 819 +; RV32I-NEXT: and a1, a0, s1 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 @@ -1200,21 +1200,21 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV32I-NEXT: addi s4, a1, -241 ; RV32I-NEXT: and a0, a0, s4 ; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv 
a1, s1 +; RV32I-NEXT: addi s3, a1, 257 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: srli s5, a0, 24 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s2, a0 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: srli a0, s0, 1 +; RV32I-NEXT: and a0, a0, s2 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: and a1, a0, s1 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: add a0, a0, s5 diff --git a/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll b/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll index 7f7cf044ec280..32cae6fc1df4a 100644 --- a/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll +++ b/llvm/test/CodeGen/RISCV/rv32i-rv64i-half.ll @@ -19,18 +19,18 @@ define half @half_test(half %a, half %b) nounwind { ; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: mv s0, a1 ; RV32I-NEXT: lui a1, 16 -; RV32I-NEXT: addi s1, a1, -1 -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: addi s2, a1, -1 +; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: call __gnu_h2f_ieee@plt -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: and a0, s0, s1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: and a0, s0, s2 ; RV32I-NEXT: call __gnu_h2f_ieee@plt ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: mv a0, s1 ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call __addsf3@plt ; RV32I-NEXT: call __gnu_f2h_ieee@plt -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: call __gnu_h2f_ieee@plt ; RV32I-NEXT: mv a1, s0 ; RV32I-NEXT: call __divsf3@plt diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll index e30edbd5a388a..033a65d484b8a 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbp.ll @@ -218,38 
+218,38 @@ declare i64 @llvm.fshl.i64(i64, i64, i64) define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: rol_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: mv a7, a1 +; RV32I-NEXT: mv t1, a1 ; RV32I-NEXT: andi a1, a2, 63 -; RV32I-NEXT: addi t0, a1, -32 +; RV32I-NEXT: addi a7, a1, -32 ; RV32I-NEXT: addi a6, zero, 31 -; RV32I-NEXT: bltz t0, .LBB7_2 +; RV32I-NEXT: bltz a7, .LBB7_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: sll a1, a0, t0 +; RV32I-NEXT: sll a1, a0, a7 ; RV32I-NEXT: j .LBB7_3 ; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: sll a3, a7, a2 +; RV32I-NEXT: sll a4, t1, a2 ; RV32I-NEXT: sub a1, a6, a1 -; RV32I-NEXT: srli a4, a0, 1 -; RV32I-NEXT: srl a1, a4, a1 -; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: srli a5, a0, 1 +; RV32I-NEXT: srl a1, a5, a1 +; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: .LBB7_3: ; RV32I-NEXT: neg a5, a2 -; RV32I-NEXT: andi a3, a5, 63 -; RV32I-NEXT: addi a4, a3, -32 -; RV32I-NEXT: bltz a4, .LBB7_5 +; RV32I-NEXT: andi a4, a5, 63 +; RV32I-NEXT: addi t0, a4, -32 +; RV32I-NEXT: bltz t0, .LBB7_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: srl a3, a7, a4 -; RV32I-NEXT: bltz t0, .LBB7_6 +; RV32I-NEXT: srl a3, t1, t0 +; RV32I-NEXT: bltz a7, .LBB7_6 ; RV32I-NEXT: j .LBB7_7 ; RV32I-NEXT: .LBB7_5: -; RV32I-NEXT: srl a4, a7, a5 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: srl a4, a0, a5 -; RV32I-NEXT: sub a3, a6, a3 -; RV32I-NEXT: slli a5, a7, 1 -; RV32I-NEXT: sll a3, a5, a3 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: bgez t0, .LBB7_7 +; RV32I-NEXT: srl a3, t1, a5 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: srl a3, a0, a5 +; RV32I-NEXT: sub a4, a6, a4 +; RV32I-NEXT: slli a5, t1, 1 +; RV32I-NEXT: sll a4, a5, a4 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: bgez a7, .LBB7_7 ; RV32I-NEXT: .LBB7_6: ; RV32I-NEXT: sll a0, a0, a2 ; RV32I-NEXT: or a3, a3, a0 @@ -257,122 +257,122 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: ret ; -; RV32B-LABEL: rol_i64: -; RV32B: # %bb.0: -; RV32B-NEXT: sll a7, a1, a2 -; RV32B-NEXT: andi a4, a2, 63 -; 
RV32B-NEXT: addi a6, zero, 31 -; RV32B-NEXT: sub a5, a6, a4 -; RV32B-NEXT: srli a3, a0, 1 -; RV32B-NEXT: srl a3, a3, a5 -; RV32B-NEXT: or a7, a7, a3 -; RV32B-NEXT: addi t1, a4, -32 -; RV32B-NEXT: sll a5, a0, t1 -; RV32B-NEXT: slti a3, t1, 0 -; RV32B-NEXT: cmov a7, a3, a7, a5 -; RV32B-NEXT: neg a5, a2 -; RV32B-NEXT: srl t0, a1, a5 -; RV32B-NEXT: andi t2, a5, 63 -; RV32B-NEXT: addi a4, t2, -32 -; RV32B-NEXT: srai a3, a4, 31 -; RV32B-NEXT: and a3, a3, t0 -; RV32B-NEXT: or a7, a7, a3 -; RV32B-NEXT: srl t0, a0, a5 -; RV32B-NEXT: sub a5, a6, t2 -; RV32B-NEXT: slli a3, a1, 1 -; RV32B-NEXT: sll a3, a3, a5 -; RV32B-NEXT: or a3, t0, a3 -; RV32B-NEXT: srl a1, a1, a4 -; RV32B-NEXT: slti a4, a4, 0 -; RV32B-NEXT: cmov a1, a4, a3, a1 -; RV32B-NEXT: sll a0, a0, a2 -; RV32B-NEXT: srai a2, t1, 31 -; RV32B-NEXT: and a0, a2, a0 -; RV32B-NEXT: or a0, a0, a1 -; RV32B-NEXT: mv a1, a7 -; RV32B-NEXT: ret -; -; RV32ZBB-LABEL: rol_i64: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: mv a7, a1 -; RV32ZBB-NEXT: andi a1, a2, 63 -; RV32ZBB-NEXT: addi t0, a1, -32 -; RV32ZBB-NEXT: addi a6, zero, 31 -; RV32ZBB-NEXT: bltz t0, .LBB7_2 -; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: sll a1, a0, t0 -; RV32ZBB-NEXT: j .LBB7_3 -; RV32ZBB-NEXT: .LBB7_2: -; RV32ZBB-NEXT: sll a3, a7, a2 -; RV32ZBB-NEXT: sub a1, a6, a1 -; RV32ZBB-NEXT: srli a4, a0, 1 -; RV32ZBB-NEXT: srl a1, a4, a1 -; RV32ZBB-NEXT: or a1, a3, a1 -; RV32ZBB-NEXT: .LBB7_3: -; RV32ZBB-NEXT: neg a5, a2 -; RV32ZBB-NEXT: andi a3, a5, 63 -; RV32ZBB-NEXT: addi a4, a3, -32 -; RV32ZBB-NEXT: bltz a4, .LBB7_5 -; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: srl a3, a7, a4 -; RV32ZBB-NEXT: bltz t0, .LBB7_6 -; RV32ZBB-NEXT: j .LBB7_7 -; RV32ZBB-NEXT: .LBB7_5: -; RV32ZBB-NEXT: srl a4, a7, a5 -; RV32ZBB-NEXT: or a1, a1, a4 -; RV32ZBB-NEXT: srl a4, a0, a5 -; RV32ZBB-NEXT: sub a3, a6, a3 -; RV32ZBB-NEXT: slli a5, a7, 1 -; RV32ZBB-NEXT: sll a3, a5, a3 -; RV32ZBB-NEXT: or a3, a4, a3 -; RV32ZBB-NEXT: bgez t0, .LBB7_7 -; RV32ZBB-NEXT: .LBB7_6: -; RV32ZBB-NEXT: sll a0, a0, a2 -; 
RV32ZBB-NEXT: or a3, a3, a0 -; RV32ZBB-NEXT: .LBB7_7: -; RV32ZBB-NEXT: mv a0, a3 -; RV32ZBB-NEXT: ret -; -; RV32ZBP-LABEL: rol_i64: -; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: mv a7, a1 -; RV32ZBP-NEXT: andi a1, a2, 63 -; RV32ZBP-NEXT: addi t0, a1, -32 -; RV32ZBP-NEXT: addi a6, zero, 31 -; RV32ZBP-NEXT: bltz t0, .LBB7_2 -; RV32ZBP-NEXT: # %bb.1: -; RV32ZBP-NEXT: sll a1, a0, t0 -; RV32ZBP-NEXT: j .LBB7_3 -; RV32ZBP-NEXT: .LBB7_2: -; RV32ZBP-NEXT: sll a3, a7, a2 -; RV32ZBP-NEXT: sub a1, a6, a1 -; RV32ZBP-NEXT: srli a4, a0, 1 -; RV32ZBP-NEXT: srl a1, a4, a1 -; RV32ZBP-NEXT: or a1, a3, a1 -; RV32ZBP-NEXT: .LBB7_3: -; RV32ZBP-NEXT: neg a5, a2 -; RV32ZBP-NEXT: andi a3, a5, 63 -; RV32ZBP-NEXT: addi a4, a3, -32 -; RV32ZBP-NEXT: bltz a4, .LBB7_5 -; RV32ZBP-NEXT: # %bb.4: -; RV32ZBP-NEXT: srl a3, a7, a4 -; RV32ZBP-NEXT: bltz t0, .LBB7_6 -; RV32ZBP-NEXT: j .LBB7_7 -; RV32ZBP-NEXT: .LBB7_5: -; RV32ZBP-NEXT: srl a4, a7, a5 -; RV32ZBP-NEXT: or a1, a1, a4 -; RV32ZBP-NEXT: srl a4, a0, a5 -; RV32ZBP-NEXT: sub a3, a6, a3 -; RV32ZBP-NEXT: slli a5, a7, 1 -; RV32ZBP-NEXT: sll a3, a5, a3 -; RV32ZBP-NEXT: or a3, a4, a3 -; RV32ZBP-NEXT: bgez t0, .LBB7_7 -; RV32ZBP-NEXT: .LBB7_6: -; RV32ZBP-NEXT: sll a0, a0, a2 -; RV32ZBP-NEXT: or a3, a3, a0 -; RV32ZBP-NEXT: .LBB7_7: -; RV32ZBP-NEXT: mv a0, a3 -; RV32ZBP-NEXT: ret +; RV32IB-LABEL: rol_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sll a7, a1, a2 +; RV32IB-NEXT: andi a4, a2, 63 +; RV32IB-NEXT: addi a6, zero, 31 +; RV32IB-NEXT: sub a5, a6, a4 +; RV32IB-NEXT: srli a3, a0, 1 +; RV32IB-NEXT: srl a3, a3, a5 +; RV32IB-NEXT: or a3, a7, a3 +; RV32IB-NEXT: addi a7, a4, -32 +; RV32IB-NEXT: sll a5, a0, a7 +; RV32IB-NEXT: slti a4, a7, 0 +; RV32IB-NEXT: cmov t0, a4, a3, a5 +; RV32IB-NEXT: neg a4, a2 +; RV32IB-NEXT: srl t2, a1, a4 +; RV32IB-NEXT: andi a3, a4, 63 +; RV32IB-NEXT: addi t1, a3, -32 +; RV32IB-NEXT: srai a5, t1, 31 +; RV32IB-NEXT: and a5, a5, t2 +; RV32IB-NEXT: or t0, t0, a5 +; RV32IB-NEXT: srl a4, a0, a4 +; RV32IB-NEXT: sub a3, a6, a3 +; RV32IB-NEXT: 
slli a5, a1, 1 +; RV32IB-NEXT: sll a3, a5, a3 +; RV32IB-NEXT: or a3, a4, a3 +; RV32IB-NEXT: srl a1, a1, t1 +; RV32IB-NEXT: slti a4, t1, 0 +; RV32IB-NEXT: cmov a1, a4, a3, a1 +; RV32IB-NEXT: sll a0, a0, a2 +; RV32IB-NEXT: srai a2, a7, 31 +; RV32IB-NEXT: and a0, a2, a0 +; RV32IB-NEXT: or a0, a0, a1 +; RV32IB-NEXT: mv a1, t0 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: rol_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: mv t1, a1 +; RV32IBB-NEXT: andi a1, a2, 63 +; RV32IBB-NEXT: addi a7, a1, -32 +; RV32IBB-NEXT: addi a6, zero, 31 +; RV32IBB-NEXT: bltz a7, .LBB7_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: sll a1, a0, a7 +; RV32IBB-NEXT: j .LBB7_3 +; RV32IBB-NEXT: .LBB7_2: +; RV32IBB-NEXT: sll a4, t1, a2 +; RV32IBB-NEXT: sub a1, a6, a1 +; RV32IBB-NEXT: srli a5, a0, 1 +; RV32IBB-NEXT: srl a1, a5, a1 +; RV32IBB-NEXT: or a1, a4, a1 +; RV32IBB-NEXT: .LBB7_3: +; RV32IBB-NEXT: neg a5, a2 +; RV32IBB-NEXT: andi a4, a5, 63 +; RV32IBB-NEXT: addi t0, a4, -32 +; RV32IBB-NEXT: bltz t0, .LBB7_5 +; RV32IBB-NEXT: # %bb.4: +; RV32IBB-NEXT: srl a3, t1, t0 +; RV32IBB-NEXT: bltz a7, .LBB7_6 +; RV32IBB-NEXT: j .LBB7_7 +; RV32IBB-NEXT: .LBB7_5: +; RV32IBB-NEXT: srl a3, t1, a5 +; RV32IBB-NEXT: or a1, a1, a3 +; RV32IBB-NEXT: srl a3, a0, a5 +; RV32IBB-NEXT: sub a4, a6, a4 +; RV32IBB-NEXT: slli a5, t1, 1 +; RV32IBB-NEXT: sll a4, a5, a4 +; RV32IBB-NEXT: or a3, a3, a4 +; RV32IBB-NEXT: bgez a7, .LBB7_7 +; RV32IBB-NEXT: .LBB7_6: +; RV32IBB-NEXT: sll a0, a0, a2 +; RV32IBB-NEXT: or a3, a3, a0 +; RV32IBB-NEXT: .LBB7_7: +; RV32IBB-NEXT: mv a0, a3 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: rol_i64: +; RV32IBP: # %bb.0: +; RV32IBP-NEXT: mv t1, a1 +; RV32IBP-NEXT: andi a1, a2, 63 +; RV32IBP-NEXT: addi a7, a1, -32 +; RV32IBP-NEXT: addi a6, zero, 31 +; RV32IBP-NEXT: bltz a7, .LBB7_2 +; RV32IBP-NEXT: # %bb.1: +; RV32IBP-NEXT: sll a1, a0, a7 +; RV32IBP-NEXT: j .LBB7_3 +; RV32IBP-NEXT: .LBB7_2: +; RV32IBP-NEXT: sll a4, t1, a2 +; RV32IBP-NEXT: sub a1, a6, a1 +; RV32IBP-NEXT: srli a5, a0, 1 +; RV32IBP-NEXT: srl a1, a5, a1 
+; RV32IBP-NEXT: or a1, a4, a1 +; RV32IBP-NEXT: .LBB7_3: +; RV32IBP-NEXT: neg a5, a2 +; RV32IBP-NEXT: andi a4, a5, 63 +; RV32IBP-NEXT: addi t0, a4, -32 +; RV32IBP-NEXT: bltz t0, .LBB7_5 +; RV32IBP-NEXT: # %bb.4: +; RV32IBP-NEXT: srl a3, t1, t0 +; RV32IBP-NEXT: bltz a7, .LBB7_6 +; RV32IBP-NEXT: j .LBB7_7 +; RV32IBP-NEXT: .LBB7_5: +; RV32IBP-NEXT: srl a3, t1, a5 +; RV32IBP-NEXT: or a1, a1, a3 +; RV32IBP-NEXT: srl a3, a0, a5 +; RV32IBP-NEXT: sub a4, a6, a4 +; RV32IBP-NEXT: slli a5, t1, 1 +; RV32IBP-NEXT: sll a4, a5, a4 +; RV32IBP-NEXT: or a3, a3, a4 +; RV32IBP-NEXT: bgez a7, .LBB7_7 +; RV32IBP-NEXT: .LBB7_6: +; RV32IBP-NEXT: sll a0, a0, a2 +; RV32IBP-NEXT: or a3, a3, a0 +; RV32IBP-NEXT: .LBB7_7: +; RV32IBP-NEXT: mv a0, a3 +; RV32IBP-NEXT: ret %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) ret i64 %or } @@ -416,7 +416,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: ror_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: mv t0, a0 +; RV32I-NEXT: mv t1, a0 ; RV32I-NEXT: andi a0, a2, 63 ; RV32I-NEXT: addi a7, a0, -32 ; RV32I-NEXT: addi a6, zero, 31 @@ -425,26 +425,26 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: srl a0, a1, a7 ; RV32I-NEXT: j .LBB9_3 ; RV32I-NEXT: .LBB9_2: -; RV32I-NEXT: srl a3, t0, a2 +; RV32I-NEXT: srl a4, t1, a2 ; RV32I-NEXT: sub a0, a6, a0 -; RV32I-NEXT: slli a4, a1, 1 -; RV32I-NEXT: sll a0, a4, a0 -; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: slli a5, a1, 1 +; RV32I-NEXT: sll a0, a5, a0 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: .LBB9_3: ; RV32I-NEXT: neg a5, a2 ; RV32I-NEXT: andi a4, a5, 63 -; RV32I-NEXT: addi a3, a4, -32 -; RV32I-NEXT: bltz a3, .LBB9_5 +; RV32I-NEXT: addi t0, a4, -32 +; RV32I-NEXT: bltz t0, .LBB9_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sll a3, t0, a3 +; RV32I-NEXT: sll a3, t1, t0 ; RV32I-NEXT: bltz a7, .LBB9_6 ; RV32I-NEXT: j .LBB9_7 ; RV32I-NEXT: .LBB9_5: -; RV32I-NEXT: sll a3, t0, a5 +; RV32I-NEXT: sll a3, t1, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: 
sll a3, a1, a5 ; RV32I-NEXT: sub a4, a6, a4 -; RV32I-NEXT: srli a5, t0, 1 +; RV32I-NEXT: srli a5, t1, 1 ; RV32I-NEXT: srl a4, a5, a4 ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: bgez a7, .LBB9_7 @@ -455,122 +455,122 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: mv a1, a3 ; RV32I-NEXT: ret ; -; RV32B-LABEL: ror_i64: -; RV32B: # %bb.0: -; RV32B-NEXT: srl a7, a0, a2 -; RV32B-NEXT: andi a4, a2, 63 -; RV32B-NEXT: addi a6, zero, 31 -; RV32B-NEXT: sub a5, a6, a4 -; RV32B-NEXT: slli a3, a1, 1 -; RV32B-NEXT: sll a3, a3, a5 -; RV32B-NEXT: or a7, a7, a3 -; RV32B-NEXT: addi t1, a4, -32 -; RV32B-NEXT: srl a5, a1, t1 -; RV32B-NEXT: slti a3, t1, 0 -; RV32B-NEXT: cmov a7, a3, a7, a5 -; RV32B-NEXT: neg a5, a2 -; RV32B-NEXT: sll t0, a0, a5 -; RV32B-NEXT: andi t2, a5, 63 -; RV32B-NEXT: addi a4, t2, -32 -; RV32B-NEXT: srai a3, a4, 31 -; RV32B-NEXT: and a3, a3, t0 -; RV32B-NEXT: or a7, a7, a3 -; RV32B-NEXT: sll t0, a1, a5 -; RV32B-NEXT: sub a5, a6, t2 -; RV32B-NEXT: srli a3, a0, 1 -; RV32B-NEXT: srl a3, a3, a5 -; RV32B-NEXT: or a3, t0, a3 -; RV32B-NEXT: sll a0, a0, a4 -; RV32B-NEXT: slti a4, a4, 0 -; RV32B-NEXT: cmov a0, a4, a3, a0 -; RV32B-NEXT: srl a1, a1, a2 -; RV32B-NEXT: srai a2, t1, 31 -; RV32B-NEXT: and a1, a2, a1 -; RV32B-NEXT: or a1, a1, a0 -; RV32B-NEXT: mv a0, a7 -; RV32B-NEXT: ret -; -; RV32ZBB-LABEL: ror_i64: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: mv t0, a0 -; RV32ZBB-NEXT: andi a0, a2, 63 -; RV32ZBB-NEXT: addi a7, a0, -32 -; RV32ZBB-NEXT: addi a6, zero, 31 -; RV32ZBB-NEXT: bltz a7, .LBB9_2 -; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: srl a0, a1, a7 -; RV32ZBB-NEXT: j .LBB9_3 -; RV32ZBB-NEXT: .LBB9_2: -; RV32ZBB-NEXT: srl a3, t0, a2 -; RV32ZBB-NEXT: sub a0, a6, a0 -; RV32ZBB-NEXT: slli a4, a1, 1 -; RV32ZBB-NEXT: sll a0, a4, a0 -; RV32ZBB-NEXT: or a0, a3, a0 -; RV32ZBB-NEXT: .LBB9_3: -; RV32ZBB-NEXT: neg a5, a2 -; RV32ZBB-NEXT: andi a4, a5, 63 -; RV32ZBB-NEXT: addi a3, a4, -32 -; RV32ZBB-NEXT: bltz a3, .LBB9_5 -; RV32ZBB-NEXT: # %bb.4: -; RV32ZBB-NEXT: sll a3, 
t0, a3 -; RV32ZBB-NEXT: bltz a7, .LBB9_6 -; RV32ZBB-NEXT: j .LBB9_7 -; RV32ZBB-NEXT: .LBB9_5: -; RV32ZBB-NEXT: sll a3, t0, a5 -; RV32ZBB-NEXT: or a0, a0, a3 -; RV32ZBB-NEXT: sll a3, a1, a5 -; RV32ZBB-NEXT: sub a4, a6, a4 -; RV32ZBB-NEXT: srli a5, t0, 1 -; RV32ZBB-NEXT: srl a4, a5, a4 -; RV32ZBB-NEXT: or a3, a3, a4 -; RV32ZBB-NEXT: bgez a7, .LBB9_7 -; RV32ZBB-NEXT: .LBB9_6: -; RV32ZBB-NEXT: srl a1, a1, a2 -; RV32ZBB-NEXT: or a3, a3, a1 -; RV32ZBB-NEXT: .LBB9_7: -; RV32ZBB-NEXT: mv a1, a3 -; RV32ZBB-NEXT: ret -; -; RV32ZBP-LABEL: ror_i64: -; RV32ZBP: # %bb.0: -; RV32ZBP-NEXT: mv t0, a0 -; RV32ZBP-NEXT: andi a0, a2, 63 -; RV32ZBP-NEXT: addi a7, a0, -32 -; RV32ZBP-NEXT: addi a6, zero, 31 -; RV32ZBP-NEXT: bltz a7, .LBB9_2 -; RV32ZBP-NEXT: # %bb.1: -; RV32ZBP-NEXT: srl a0, a1, a7 -; RV32ZBP-NEXT: j .LBB9_3 -; RV32ZBP-NEXT: .LBB9_2: -; RV32ZBP-NEXT: srl a3, t0, a2 -; RV32ZBP-NEXT: sub a0, a6, a0 -; RV32ZBP-NEXT: slli a4, a1, 1 -; RV32ZBP-NEXT: sll a0, a4, a0 -; RV32ZBP-NEXT: or a0, a3, a0 -; RV32ZBP-NEXT: .LBB9_3: -; RV32ZBP-NEXT: neg a5, a2 -; RV32ZBP-NEXT: andi a4, a5, 63 -; RV32ZBP-NEXT: addi a3, a4, -32 -; RV32ZBP-NEXT: bltz a3, .LBB9_5 -; RV32ZBP-NEXT: # %bb.4: -; RV32ZBP-NEXT: sll a3, t0, a3 -; RV32ZBP-NEXT: bltz a7, .LBB9_6 -; RV32ZBP-NEXT: j .LBB9_7 -; RV32ZBP-NEXT: .LBB9_5: -; RV32ZBP-NEXT: sll a3, t0, a5 -; RV32ZBP-NEXT: or a0, a0, a3 -; RV32ZBP-NEXT: sll a3, a1, a5 -; RV32ZBP-NEXT: sub a4, a6, a4 -; RV32ZBP-NEXT: srli a5, t0, 1 -; RV32ZBP-NEXT: srl a4, a5, a4 -; RV32ZBP-NEXT: or a3, a3, a4 -; RV32ZBP-NEXT: bgez a7, .LBB9_7 -; RV32ZBP-NEXT: .LBB9_6: -; RV32ZBP-NEXT: srl a1, a1, a2 -; RV32ZBP-NEXT: or a3, a3, a1 -; RV32ZBP-NEXT: .LBB9_7: -; RV32ZBP-NEXT: mv a1, a3 -; RV32ZBP-NEXT: ret +; RV32IB-LABEL: ror_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: srl a7, a0, a2 +; RV32IB-NEXT: andi a4, a2, 63 +; RV32IB-NEXT: addi a6, zero, 31 +; RV32IB-NEXT: sub a5, a6, a4 +; RV32IB-NEXT: slli a3, a1, 1 +; RV32IB-NEXT: sll a3, a3, a5 +; RV32IB-NEXT: or a3, a7, a3 +; RV32IB-NEXT: 
addi a7, a4, -32 +; RV32IB-NEXT: srl a5, a1, a7 +; RV32IB-NEXT: slti a4, a7, 0 +; RV32IB-NEXT: cmov t0, a4, a3, a5 +; RV32IB-NEXT: neg a4, a2 +; RV32IB-NEXT: sll t2, a0, a4 +; RV32IB-NEXT: andi a3, a4, 63 +; RV32IB-NEXT: addi t1, a3, -32 +; RV32IB-NEXT: srai a5, t1, 31 +; RV32IB-NEXT: and a5, a5, t2 +; RV32IB-NEXT: or t0, t0, a5 +; RV32IB-NEXT: sll a4, a1, a4 +; RV32IB-NEXT: sub a3, a6, a3 +; RV32IB-NEXT: srli a5, a0, 1 +; RV32IB-NEXT: srl a3, a5, a3 +; RV32IB-NEXT: or a3, a4, a3 +; RV32IB-NEXT: sll a0, a0, t1 +; RV32IB-NEXT: slti a4, t1, 0 +; RV32IB-NEXT: cmov a0, a4, a3, a0 +; RV32IB-NEXT: srl a1, a1, a2 +; RV32IB-NEXT: srai a2, a7, 31 +; RV32IB-NEXT: and a1, a2, a1 +; RV32IB-NEXT: or a1, a1, a0 +; RV32IB-NEXT: mv a0, t0 +; RV32IB-NEXT: ret +; +; RV32IBB-LABEL: ror_i64: +; RV32IBB: # %bb.0: +; RV32IBB-NEXT: mv t1, a0 +; RV32IBB-NEXT: andi a0, a2, 63 +; RV32IBB-NEXT: addi a7, a0, -32 +; RV32IBB-NEXT: addi a6, zero, 31 +; RV32IBB-NEXT: bltz a7, .LBB9_2 +; RV32IBB-NEXT: # %bb.1: +; RV32IBB-NEXT: srl a0, a1, a7 +; RV32IBB-NEXT: j .LBB9_3 +; RV32IBB-NEXT: .LBB9_2: +; RV32IBB-NEXT: srl a4, t1, a2 +; RV32IBB-NEXT: sub a0, a6, a0 +; RV32IBB-NEXT: slli a5, a1, 1 +; RV32IBB-NEXT: sll a0, a5, a0 +; RV32IBB-NEXT: or a0, a4, a0 +; RV32IBB-NEXT: .LBB9_3: +; RV32IBB-NEXT: neg a5, a2 +; RV32IBB-NEXT: andi a4, a5, 63 +; RV32IBB-NEXT: addi t0, a4, -32 +; RV32IBB-NEXT: bltz t0, .LBB9_5 +; RV32IBB-NEXT: # %bb.4: +; RV32IBB-NEXT: sll a3, t1, t0 +; RV32IBB-NEXT: bltz a7, .LBB9_6 +; RV32IBB-NEXT: j .LBB9_7 +; RV32IBB-NEXT: .LBB9_5: +; RV32IBB-NEXT: sll a3, t1, a5 +; RV32IBB-NEXT: or a0, a0, a3 +; RV32IBB-NEXT: sll a3, a1, a5 +; RV32IBB-NEXT: sub a4, a6, a4 +; RV32IBB-NEXT: srli a5, t1, 1 +; RV32IBB-NEXT: srl a4, a5, a4 +; RV32IBB-NEXT: or a3, a3, a4 +; RV32IBB-NEXT: bgez a7, .LBB9_7 +; RV32IBB-NEXT: .LBB9_6: +; RV32IBB-NEXT: srl a1, a1, a2 +; RV32IBB-NEXT: or a3, a3, a1 +; RV32IBB-NEXT: .LBB9_7: +; RV32IBB-NEXT: mv a1, a3 +; RV32IBB-NEXT: ret +; +; RV32IBP-LABEL: ror_i64: +; RV32IBP: # 
%bb.0: +; RV32IBP-NEXT: mv t1, a0 +; RV32IBP-NEXT: andi a0, a2, 63 +; RV32IBP-NEXT: addi a7, a0, -32 +; RV32IBP-NEXT: addi a6, zero, 31 +; RV32IBP-NEXT: bltz a7, .LBB9_2 +; RV32IBP-NEXT: # %bb.1: +; RV32IBP-NEXT: srl a0, a1, a7 +; RV32IBP-NEXT: j .LBB9_3 +; RV32IBP-NEXT: .LBB9_2: +; RV32IBP-NEXT: srl a4, t1, a2 +; RV32IBP-NEXT: sub a0, a6, a0 +; RV32IBP-NEXT: slli a5, a1, 1 +; RV32IBP-NEXT: sll a0, a5, a0 +; RV32IBP-NEXT: or a0, a4, a0 +; RV32IBP-NEXT: .LBB9_3: +; RV32IBP-NEXT: neg a5, a2 +; RV32IBP-NEXT: andi a4, a5, 63 +; RV32IBP-NEXT: addi t0, a4, -32 +; RV32IBP-NEXT: bltz t0, .LBB9_5 +; RV32IBP-NEXT: # %bb.4: +; RV32IBP-NEXT: sll a3, t1, t0 +; RV32IBP-NEXT: bltz a7, .LBB9_6 +; RV32IBP-NEXT: j .LBB9_7 +; RV32IBP-NEXT: .LBB9_5: +; RV32IBP-NEXT: sll a3, t1, a5 +; RV32IBP-NEXT: or a0, a0, a3 +; RV32IBP-NEXT: sll a3, a1, a5 +; RV32IBP-NEXT: sub a4, a6, a4 +; RV32IBP-NEXT: srli a5, t1, 1 +; RV32IBP-NEXT: srl a4, a5, a4 +; RV32IBP-NEXT: or a3, a3, a4 +; RV32IBP-NEXT: bgez a7, .LBB9_7 +; RV32IBP-NEXT: .LBB9_6: +; RV32IBP-NEXT: srl a1, a1, a2 +; RV32IBP-NEXT: or a3, a3, a1 +; RV32IBP-NEXT: .LBB9_7: +; RV32IBP-NEXT: mv a1, a3 +; RV32IBP-NEXT: ret %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) ret i64 %or } diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 5380db99301ec..cd43db48f41b1 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -81,8 +81,8 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s3, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 ; RV32I-NEXT: srli a0, a1, 1 ; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 @@ -96,14 +96,14 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s5, 
a2, 1365 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: addi s4, a2, 1365 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s1, a1, 819 -; RV32I-NEXT: and a1, a0, s1 +; RV32I-NEXT: addi s5, a1, 819 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 @@ -111,12 +111,12 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: addi s6, a1, -241 ; RV32I-NEXT: and a0, a0, s6 ; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s0, a1, 257 -; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: addi s3, a1, 257 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: srli a0, s4, 1 -; RV32I-NEXT: or a0, s4, a0 +; RV32I-NEXT: srli a0, s1, 1 +; RV32I-NEXT: or a0, s1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -127,18 +127,18 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s1 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt -; RV32I-NEXT: bnez s3, .LBB1_2 +; RV32I-NEXT: bnez s0, .LBB1_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: addi a0, a0, 32 @@ -250,21 +250,21 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s3, a1 -; RV32I-NEXT: mv s4, a0 +; RV32I-NEXT: mv s1, a1 +; 
RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: addi a0, a0, -1 -; RV32I-NEXT: not a1, s4 +; RV32I-NEXT: not a1, s0 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s5, a2, 1365 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: addi s4, a2, 1365 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 ; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s0, a1, 819 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: addi s5, a1, 819 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 @@ -272,26 +272,26 @@ define i64 @cttz_i64(i64 %a) nounwind { ; RV32I-NEXT: addi s6, a1, -241 ; RV32I-NEXT: and a0, a0, s6 ; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: addi s3, a1, 257 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: addi a0, s3, -1 -; RV32I-NEXT: not a1, s3 +; RV32I-NEXT: addi a0, s1, -1 +; RV32I-NEXT: not a1, s1 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s5 +; RV32I-NEXT: and a1, a1, s4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: and a1, a0, s5 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s5 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt -; RV32I-NEXT: bnez s4, .LBB3_2 +; RV32I-NEXT: bnez s0, .LBB3_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: addi a0, a0, 32 @@ -393,17 +393,17 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s2, a0 +; 
RV32I-NEXT: mv s0, a0 ; RV32I-NEXT: srli a0, a1, 1 ; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s3, a2, 1365 -; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: addi s2, a2, 1365 +; RV32I-NEXT: and a0, a0, s2 ; RV32I-NEXT: sub a0, a1, a0 ; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s0, a1, 819 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: addi s1, a1, 819 +; RV32I-NEXT: and a1, a0, s1 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 @@ -411,21 +411,21 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV32I-NEXT: addi s4, a1, -241 ; RV32I-NEXT: and a0, a0, s4 ; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: addi s3, a1, 257 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: srli s5, a0, 24 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s2, a0 -; RV32I-NEXT: and a1, a0, s0 +; RV32I-NEXT: srli a0, s0, 1 +; RV32I-NEXT: and a0, a0, s2 +; RV32I-NEXT: sub a0, s0, a0 +; RV32I-NEXT: and a1, a0, s1 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s0 +; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: mv a1, s3 ; RV32I-NEXT: call __mulsi3@plt ; RV32I-NEXT: srli a0, a0, 24 ; RV32I-NEXT: add a0, a0, s5 diff --git a/llvm/test/CodeGen/RISCV/rv32zbp.ll b/llvm/test/CodeGen/RISCV/rv32zbp.ll index 2198259140916..cfad9fb9110a4 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbp.ll @@ -1067,47 +1067,47 @@ define i64 @gorc3b_i64(i64 %a) nounwind { ; RV32I-NEXT: slli a2, a0, 1 ; RV32I-NEXT: slli a3, a1, 1 ; RV32I-NEXT: lui a4, 699051 -; RV32I-NEXT: addi a4, a4, -1366 -; RV32I-NEXT: and a6, a3, a4 -; RV32I-NEXT: and a7, a2, a4 +; RV32I-NEXT: addi a6, a4, -1366 +; RV32I-NEXT: and a7, a3, a6 +; 
RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a5, a1, 1 -; RV32I-NEXT: srli a3, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi a2, a2, 1365 -; RV32I-NEXT: and a3, a3, a2 -; RV32I-NEXT: and a5, a5, a2 +; RV32I-NEXT: srli a4, a0, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi t0, a3, 1365 +; RV32I-NEXT: and a4, a4, t0 +; RV32I-NEXT: and a5, a5, t0 ; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a6, a1, 2 -; RV32I-NEXT: slli a5, a0, 2 -; RV32I-NEXT: lui a3, 838861 -; RV32I-NEXT: addi a3, a3, -820 -; RV32I-NEXT: and a7, a5, a3 -; RV32I-NEXT: and a6, a6, a3 -; RV32I-NEXT: srli t0, a0, 2 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli a2, a1, 2 +; RV32I-NEXT: slli a4, a0, 2 +; RV32I-NEXT: lui a5, 838861 +; RV32I-NEXT: addi a5, a5, -820 +; RV32I-NEXT: and a7, a4, a5 +; RV32I-NEXT: and a2, a2, a5 +; RV32I-NEXT: srli a5, a0, 2 ; RV32I-NEXT: srli a3, a1, 2 -; RV32I-NEXT: lui a5, 209715 -; RV32I-NEXT: addi a5, a5, 819 -; RV32I-NEXT: and a3, a3, a5 -; RV32I-NEXT: and a5, t0, a5 -; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a4, a5, a4 +; RV32I-NEXT: or a0, a4, a0 ; RV32I-NEXT: or a1, a3, a1 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: or a0, a0, a7 -; RV32I-NEXT: slli a3, a0, 1 -; RV32I-NEXT: slli a5, a1, 1 -; RV32I-NEXT: and a6, a5, a4 -; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: slli a2, a0, 1 +; RV32I-NEXT: slli a3, a1, 1 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a4, a1, 1 ; RV32I-NEXT: srli a5, a0, 1 -; RV32I-NEXT: and a5, a5, a2 -; RV32I-NEXT: and a2, a4, a2 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: and a5, a5, t0 +; RV32I-NEXT: and a4, a4, t0 +; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: or a1, a1, a6 +; 
RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: ret ; ; RV32B-LABEL: gorc3b_i64: @@ -2049,9 +2049,9 @@ define i64 @grev2b_i64(i64 %a) nounwind { ; RV32I-NEXT: slli a2, a0, 1 ; RV32I-NEXT: slli a3, a1, 1 ; RV32I-NEXT: lui a4, 699051 -; RV32I-NEXT: addi a4, a4, -1366 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: addi a6, a4, -1366 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: lui a5, 349525 @@ -2060,24 +2060,24 @@ define i64 @grev2b_i64(i64 %a) nounwind { ; RV32I-NEXT: and a0, a0, a5 ; RV32I-NEXT: or a0, a2, a0 ; RV32I-NEXT: or a1, a3, a1 -; RV32I-NEXT: slli a6, a1, 2 +; RV32I-NEXT: slli a2, a1, 2 ; RV32I-NEXT: slli a3, a0, 2 -; RV32I-NEXT: lui a2, 838861 -; RV32I-NEXT: addi a2, a2, -820 -; RV32I-NEXT: and a7, a3, a2 -; RV32I-NEXT: and a2, a6, a2 +; RV32I-NEXT: lui a4, 838861 +; RV32I-NEXT: addi a4, a4, -820 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: srli a1, a1, 2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: lui a4, 209715 +; RV32I-NEXT: addi a4, a4, 819 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: or a1, a2, a1 -; RV32I-NEXT: or a0, a7, a0 +; RV32I-NEXT: or a0, a3, a0 ; RV32I-NEXT: slli a2, a0, 1 ; RV32I-NEXT: slli a3, a1, 1 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: and a1, a1, a5 @@ -2186,51 +2186,51 @@ define i64 @grev0_i64(i64 %a) nounwind { ; RV32I-NEXT: slli a2, a1, 1 ; RV32I-NEXT: slli a3, a0, 1 ; RV32I-NEXT: lui a4, 699051 -; RV32I-NEXT: addi a4, a4, -1366 -; RV32I-NEXT: and a3, a3, a4 -; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: addi a6, a4, -1366 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a2, 
a2, a6 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: lui a5, 349525 -; RV32I-NEXT: addi a5, a5, 1365 -; RV32I-NEXT: and a0, a0, a5 -; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: addi a7, a5, 1365 +; RV32I-NEXT: and a0, a0, a7 +; RV32I-NEXT: and a1, a1, a7 ; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: slli a6, a0, 2 +; RV32I-NEXT: slli a2, a0, 2 ; RV32I-NEXT: slli a3, a1, 2 -; RV32I-NEXT: lui a2, 838861 -; RV32I-NEXT: addi a2, a2, -820 -; RV32I-NEXT: and a7, a3, a2 -; RV32I-NEXT: and a6, a6, a2 +; RV32I-NEXT: lui a4, 838861 +; RV32I-NEXT: addi a4, a4, -820 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: lui a3, 209715 -; RV32I-NEXT: addi a3, a3, 819 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: and a0, a0, a3 -; RV32I-NEXT: or t0, a6, a0 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: slli a6, a1, 1 -; RV32I-NEXT: slli a0, t0, 1 -; RV32I-NEXT: and a7, a0, a4 -; RV32I-NEXT: and a4, a6, a4 -; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: srli a0, t0, 1 -; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: lui a5, 209715 +; RV32I-NEXT: addi a5, a5, 819 ; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: or a0, a7, a0 -; RV32I-NEXT: slli a4, a0, 2 -; RV32I-NEXT: slli a5, a1, 2 -; RV32I-NEXT: and a5, a5, a2 -; RV32I-NEXT: and a2, a4, a2 +; RV32I-NEXT: and a0, a0, a5 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: slli a2, a1, 1 +; RV32I-NEXT: slli a3, a0, 1 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a2, a2, a6 +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: and a0, a0, a7 +; RV32I-NEXT: and a1, a1, a7 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: slli a2, a0, 2 +; RV32I-NEXT: slli a3, a1, 2 +; RV32I-NEXT: and a3, a3, a4 +; RV32I-NEXT: and a2, a2, a4 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a1, a1, a3 -; RV32I-NEXT: 
and a0, a0, a3 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: and a0, a0, a5 ; RV32I-NEXT: or a0, a2, a0 -; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: or a1, a3, a1 ; RV32I-NEXT: ret ; ; RV32B-LABEL: grev0_i64: @@ -2592,68 +2592,68 @@ define i64 @bitreverse_i64(i64 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a2, a1, 8 ; RV32I-NEXT: lui a3, 16 -; RV32I-NEXT: addi t0, a3, -256 -; RV32I-NEXT: and a2, a2, t0 +; RV32I-NEXT: addi a6, a3, -256 +; RV32I-NEXT: and a2, a2, a6 ; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: or a2, a2, a4 ; RV32I-NEXT: slli a4, a1, 8 -; RV32I-NEXT: lui a6, 4080 -; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: lui a7, 4080 +; RV32I-NEXT: and a4, a4, a7 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: lui a2, 61681 -; RV32I-NEXT: addi t1, a2, -241 -; RV32I-NEXT: and a2, a1, t1 +; RV32I-NEXT: addi t0, a2, -241 +; RV32I-NEXT: and a2, a1, t0 ; RV32I-NEXT: slli a2, a2, 4 -; RV32I-NEXT: lui a5, 986895 -; RV32I-NEXT: addi t2, a5, 240 -; RV32I-NEXT: and a1, a1, t2 +; RV32I-NEXT: lui a3, 986895 +; RV32I-NEXT: addi t1, a3, 240 +; RV32I-NEXT: and a1, a1, t1 ; RV32I-NEXT: srli a1, a1, 4 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: lui a2, 209715 -; RV32I-NEXT: addi t3, a2, 819 -; RV32I-NEXT: and a3, a1, t3 -; RV32I-NEXT: slli a3, a3, 2 +; RV32I-NEXT: addi t2, a2, 819 +; RV32I-NEXT: and a2, a1, t2 +; RV32I-NEXT: slli a2, a2, 2 ; RV32I-NEXT: lui a4, 838861 -; RV32I-NEXT: addi a4, a4, -820 -; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: addi t3, a4, -820 +; RV32I-NEXT: and a1, a1, t3 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: lui a3, 349525 -; RV32I-NEXT: addi a3, a3, 1365 -; RV32I-NEXT: and a5, a1, a3 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: lui a2, 699051 -; RV32I-NEXT: addi a2, a2, -1366 -; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a3, a2, 1365 +; RV32I-NEXT: and a2, a1, a3 +; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: 
lui a5, 699051 +; RV32I-NEXT: addi a5, a5, -1366 +; RV32I-NEXT: and a1, a1, a5 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a7, a1, a5 +; RV32I-NEXT: or a2, a1, a2 ; RV32I-NEXT: srli a1, a0, 8 -; RV32I-NEXT: and a1, a1, t0 -; RV32I-NEXT: srli a5, a0, 24 -; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: and a1, a1, a6 +; RV32I-NEXT: srli a4, a0, 24 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: slli a4, a0, 8 +; RV32I-NEXT: and a4, a4, a7 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: and a1, a0, t1 +; RV32I-NEXT: and a1, a0, t0 ; RV32I-NEXT: slli a1, a1, 4 -; RV32I-NEXT: and a0, a0, t2 +; RV32I-NEXT: and a0, a0, t1 ; RV32I-NEXT: srli a0, a0, 4 ; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: and a1, a0, t3 +; RV32I-NEXT: and a1, a0, t2 ; RV32I-NEXT: slli a1, a1, 2 -; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: and a0, a0, t3 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: slli a1, a1, 1 -; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: and a0, a0, a5 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: mv a0, a7 +; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; ; RV32B-LABEL: bitreverse_i64: @@ -2813,55 +2813,55 @@ define i64 @bitreverse_bswap_i64(i64 %a) { ; RV32I: # %bb.0: ; RV32I-NEXT: srli a3, a1, 8 ; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi t1, a2, -256 -; RV32I-NEXT: and a3, a3, t1 +; RV32I-NEXT: addi t0, a2, -256 +; RV32I-NEXT: and a3, a3, t0 ; RV32I-NEXT: srli a4, a1, 24 ; RV32I-NEXT: or a4, a3, a4 ; RV32I-NEXT: slli a5, a1, 8 -; RV32I-NEXT: lui a6, 4080 -; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: lui t1, 4080 +; RV32I-NEXT: and a5, a5, t1 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: lui a4, 61681 -; RV32I-NEXT: addi a7, a4, -241 -; RV32I-NEXT: and a5, a1, a7 +; RV32I-NEXT: addi a6, a4, -241 +; RV32I-NEXT: and a5, 
a1, a6 ; RV32I-NEXT: slli a5, a5, 4 -; RV32I-NEXT: lui a3, 986895 -; RV32I-NEXT: addi t0, a3, 240 -; RV32I-NEXT: and a1, a1, t0 +; RV32I-NEXT: lui a4, 986895 +; RV32I-NEXT: addi a7, a4, 240 +; RV32I-NEXT: and a1, a1, a7 ; RV32I-NEXT: srli a1, a1, 4 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: lui a5, 209715 ; RV32I-NEXT: addi t2, a5, 819 ; RV32I-NEXT: and a4, a1, t2 ; RV32I-NEXT: slli a4, a4, 2 -; RV32I-NEXT: lui a3, 838861 -; RV32I-NEXT: addi t3, a3, -820 +; RV32I-NEXT: lui a2, 838861 +; RV32I-NEXT: addi t3, a2, -820 ; RV32I-NEXT: and a1, a1, t3 ; RV32I-NEXT: srli a1, a1, 2 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: lui a4, 349525 ; RV32I-NEXT: addi a4, a4, 1365 -; RV32I-NEXT: and a2, a1, a4 -; RV32I-NEXT: slli a2, a2, 1 +; RV32I-NEXT: and a3, a1, a4 +; RV32I-NEXT: slli a3, a3, 1 ; RV32I-NEXT: lui a5, 699051 ; RV32I-NEXT: addi a5, a5, -1366 ; RV32I-NEXT: and a1, a1, a5 ; RV32I-NEXT: srli a1, a1, 1 -; RV32I-NEXT: or a1, a1, a2 -; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: and a2, a2, t1 -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: srli a3, a0, 8 +; RV32I-NEXT: and a3, a3, t0 +; RV32I-NEXT: srli a2, a0, 24 +; RV32I-NEXT: or a2, a3, a2 ; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a3, a3, t1 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: and a2, a0, a7 +; RV32I-NEXT: and a2, a0, a6 ; RV32I-NEXT: slli a2, a2, 4 -; RV32I-NEXT: and a0, a0, t0 +; RV32I-NEXT: and a0, a0, a7 ; RV32I-NEXT: srli a0, a0, 4 ; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: and a2, a0, t2 @@ -2875,20 +2875,20 @@ define i64 @bitreverse_bswap_i64(i64 %a) { ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: srli a2, a0, 8 -; RV32I-NEXT: and a2, a2, t1 +; RV32I-NEXT: and a2, a2, t0 ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a3, a3, t1 ; RV32I-NEXT: slli a0, 
a0, 24 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: srli a2, a1, 8 -; RV32I-NEXT: and a2, a2, t1 +; RV32I-NEXT: and a2, a2, t0 ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: slli a3, a1, 8 -; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: and a3, a3, t1 ; RV32I-NEXT: slli a1, a1, 24 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: or a1, a1, a2 diff --git a/llvm/test/CodeGen/RISCV/rv32zbt.ll b/llvm/test/CodeGen/RISCV/rv32zbt.ll index 6b420dea50e8a..db22bf1709880 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbt.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbt.ll @@ -457,24 +457,24 @@ define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-NEXT: srl a1, a1, a5 ; RV32I-NEXT: or a1, t0, a1 ; RV32I-NEXT: .LBB13_3: -; RV32I-NEXT: not t2, a4 -; RV32I-NEXT: andi t1, t2, 63 -; RV32I-NEXT: addi a5, t1, -32 -; RV32I-NEXT: srli t0, a3, 1 -; RV32I-NEXT: bltz a5, .LBB13_5 +; RV32I-NEXT: not t0, a4 +; RV32I-NEXT: andi t3, t0, 63 +; RV32I-NEXT: addi t2, t3, -32 +; RV32I-NEXT: srli t1, a3, 1 +; RV32I-NEXT: bltz t2, .LBB13_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: srl a2, t0, a5 +; RV32I-NEXT: srl a2, t1, t2 ; RV32I-NEXT: bltz a7, .LBB13_6 ; RV32I-NEXT: j .LBB13_7 ; RV32I-NEXT: .LBB13_5: -; RV32I-NEXT: srl a5, t0, t2 +; RV32I-NEXT: srl a5, t1, t0 ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a3, a3, 31 ; RV32I-NEXT: srli a2, a2, 1 ; RV32I-NEXT: or a2, a2, a3 -; RV32I-NEXT: srl a2, a2, t2 -; RV32I-NEXT: sub a3, a6, t1 -; RV32I-NEXT: slli a5, t0, 1 +; RV32I-NEXT: srl a2, a2, t0 +; RV32I-NEXT: sub a3, a6, t3 +; RV32I-NEXT: slli a5, t1, 1 ; RV32I-NEXT: sll a3, a5, a3 ; RV32I-NEXT: or a2, a2, a3 ; RV32I-NEXT: bgez a7, .LBB13_7 @@ -485,78 +485,78 @@ define i64 @fshl_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; -; RV32B-LABEL: fshl_i64: -; RV32B: # %bb.0: -; RV32B-NEXT: sll a7, a1, a4 -; RV32B-NEXT: andi a5, a4, 63 -; RV32B-NEXT: addi a6, zero, 31 -; RV32B-NEXT: sub t0, a6, a5 -; RV32B-NEXT: srli a1, a0, 1 -; 
RV32B-NEXT: srl a1, a1, t0 -; RV32B-NEXT: or a7, a7, a1 -; RV32B-NEXT: addi t1, a5, -32 -; RV32B-NEXT: sll t0, a0, t1 -; RV32B-NEXT: slti a1, t1, 0 -; RV32B-NEXT: cmov t0, a1, a7, t0 -; RV32B-NEXT: not a7, a4 -; RV32B-NEXT: srli t4, a3, 1 -; RV32B-NEXT: srl t2, t4, a7 -; RV32B-NEXT: addi a1, zero, 63 -; RV32B-NEXT: andn t3, a1, a4 -; RV32B-NEXT: addi a5, t3, -32 -; RV32B-NEXT: srai a1, a5, 31 -; RV32B-NEXT: and a1, a1, t2 -; RV32B-NEXT: or a1, t0, a1 -; RV32B-NEXT: fsri a2, a2, a3, 1 -; RV32B-NEXT: srl a7, a2, a7 -; RV32B-NEXT: sub a3, a6, t3 -; RV32B-NEXT: slli a2, t4, 1 -; RV32B-NEXT: sll a2, a2, a3 -; RV32B-NEXT: or a2, a7, a2 -; RV32B-NEXT: srl a3, t4, a5 -; RV32B-NEXT: slti a5, a5, 0 -; RV32B-NEXT: cmov a2, a5, a2, a3 -; RV32B-NEXT: sll a0, a0, a4 -; RV32B-NEXT: srai a3, t1, 31 -; RV32B-NEXT: and a0, a3, a0 -; RV32B-NEXT: or a0, a0, a2 -; RV32B-NEXT: ret -; -; RV32ZBT-LABEL: fshl_i64: -; RV32ZBT: # %bb.0: -; RV32ZBT-NEXT: sll a7, a1, a4 -; RV32ZBT-NEXT: andi a5, a4, 63 -; RV32ZBT-NEXT: addi a6, zero, 31 -; RV32ZBT-NEXT: sub t0, a6, a5 -; RV32ZBT-NEXT: srli a1, a0, 1 -; RV32ZBT-NEXT: srl a1, a1, t0 -; RV32ZBT-NEXT: or a7, a7, a1 -; RV32ZBT-NEXT: addi t1, a5, -32 -; RV32ZBT-NEXT: sll t0, a0, t1 -; RV32ZBT-NEXT: slti a1, t1, 0 -; RV32ZBT-NEXT: cmov t0, a1, a7, t0 -; RV32ZBT-NEXT: not a5, a4 -; RV32ZBT-NEXT: srli a7, a3, 1 -; RV32ZBT-NEXT: srl t4, a7, a5 -; RV32ZBT-NEXT: andi t2, a5, 63 -; RV32ZBT-NEXT: addi t3, t2, -32 -; RV32ZBT-NEXT: srai a1, t3, 31 -; RV32ZBT-NEXT: and a1, a1, t4 -; RV32ZBT-NEXT: or a1, t0, a1 -; RV32ZBT-NEXT: fsri a2, a2, a3, 1 -; RV32ZBT-NEXT: srl a2, a2, a5 -; RV32ZBT-NEXT: sub a3, a6, t2 -; RV32ZBT-NEXT: slli a5, a7, 1 -; RV32ZBT-NEXT: sll a3, a5, a3 -; RV32ZBT-NEXT: or a2, a2, a3 -; RV32ZBT-NEXT: srl a3, a7, t3 -; RV32ZBT-NEXT: slti a5, t3, 0 -; RV32ZBT-NEXT: cmov a2, a5, a2, a3 -; RV32ZBT-NEXT: sll a0, a0, a4 -; RV32ZBT-NEXT: srai a3, t1, 31 -; RV32ZBT-NEXT: and a0, a3, a0 -; RV32ZBT-NEXT: or a0, a0, a2 -; RV32ZBT-NEXT: ret +; 
RV32IB-LABEL: fshl_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: sll a7, a1, a4 +; RV32IB-NEXT: andi a5, a4, 63 +; RV32IB-NEXT: addi a6, zero, 31 +; RV32IB-NEXT: sub t0, a6, a5 +; RV32IB-NEXT: srli a1, a0, 1 +; RV32IB-NEXT: srl a1, a1, t0 +; RV32IB-NEXT: or t0, a7, a1 +; RV32IB-NEXT: addi a7, a5, -32 +; RV32IB-NEXT: sll a5, a0, a7 +; RV32IB-NEXT: slti a1, a7, 0 +; RV32IB-NEXT: cmov t1, a1, t0, a5 +; RV32IB-NEXT: not t0, a4 +; RV32IB-NEXT: srli a5, a3, 1 +; RV32IB-NEXT: srl t2, a5, t0 +; RV32IB-NEXT: addi a1, zero, 63 +; RV32IB-NEXT: andn t3, a1, a4 +; RV32IB-NEXT: addi t4, t3, -32 +; RV32IB-NEXT: srai a1, t4, 31 +; RV32IB-NEXT: and a1, a1, t2 +; RV32IB-NEXT: or a1, t1, a1 +; RV32IB-NEXT: fsri a2, a2, a3, 1 +; RV32IB-NEXT: srl t0, a2, t0 +; RV32IB-NEXT: sub a3, a6, t3 +; RV32IB-NEXT: slli a2, a5, 1 +; RV32IB-NEXT: sll a2, a2, a3 +; RV32IB-NEXT: or a2, t0, a2 +; RV32IB-NEXT: srl a3, a5, t4 +; RV32IB-NEXT: slti a5, t4, 0 +; RV32IB-NEXT: cmov a2, a5, a2, a3 +; RV32IB-NEXT: sll a0, a0, a4 +; RV32IB-NEXT: srai a3, a7, 31 +; RV32IB-NEXT: and a0, a3, a0 +; RV32IB-NEXT: or a0, a0, a2 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshl_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: sll a7, a1, a4 +; RV32IBT-NEXT: andi a5, a4, 63 +; RV32IBT-NEXT: addi a6, zero, 31 +; RV32IBT-NEXT: sub t0, a6, a5 +; RV32IBT-NEXT: srli a1, a0, 1 +; RV32IBT-NEXT: srl a1, a1, t0 +; RV32IBT-NEXT: or t0, a7, a1 +; RV32IBT-NEXT: addi a7, a5, -32 +; RV32IBT-NEXT: sll a5, a0, a7 +; RV32IBT-NEXT: slti a1, a7, 0 +; RV32IBT-NEXT: cmov t1, a1, t0, a5 +; RV32IBT-NEXT: not t0, a4 +; RV32IBT-NEXT: srli a5, a3, 1 +; RV32IBT-NEXT: srl t4, a5, t0 +; RV32IBT-NEXT: andi t2, t0, 63 +; RV32IBT-NEXT: addi t3, t2, -32 +; RV32IBT-NEXT: srai a1, t3, 31 +; RV32IBT-NEXT: and a1, a1, t4 +; RV32IBT-NEXT: or a1, t1, a1 +; RV32IBT-NEXT: fsri a2, a2, a3, 1 +; RV32IBT-NEXT: srl t0, a2, t0 +; RV32IBT-NEXT: sub a3, a6, t2 +; RV32IBT-NEXT: slli a2, a5, 1 +; RV32IBT-NEXT: sll a2, a2, a3 +; RV32IBT-NEXT: or a2, t0, a2 +; RV32IBT-NEXT: srl a3, a5, 
t3 +; RV32IBT-NEXT: slti a5, t3, 0 +; RV32IBT-NEXT: cmov a2, a5, a2, a3 +; RV32IBT-NEXT: sll a0, a0, a4 +; RV32IBT-NEXT: srai a3, a7, 31 +; RV32IBT-NEXT: and a0, a3, a0 +; RV32IBT-NEXT: or a0, a0, a2 +; RV32IBT-NEXT: ret %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) ret i64 %1 } @@ -599,7 +599,7 @@ declare i64 @llvm.fshr.i64(i64, i64, i64) define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-LABEL: fshr_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: mv t0, a0 +; RV32I-NEXT: mv t1, a0 ; RV32I-NEXT: andi a0, a4, 63 ; RV32I-NEXT: addi a6, a0, -32 ; RV32I-NEXT: addi a7, zero, 31 @@ -614,27 +614,27 @@ define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-NEXT: sll a0, a5, a0 ; RV32I-NEXT: or a0, a2, a0 ; RV32I-NEXT: .LBB15_3: -; RV32I-NEXT: not t2, a4 -; RV32I-NEXT: andi a5, t2, 63 -; RV32I-NEXT: addi a2, a5, -32 -; RV32I-NEXT: slli t1, t0, 1 -; RV32I-NEXT: bltz a2, .LBB15_5 +; RV32I-NEXT: not t0, a4 +; RV32I-NEXT: andi a2, t0, 63 +; RV32I-NEXT: addi t2, a2, -32 +; RV32I-NEXT: slli a5, t1, 1 +; RV32I-NEXT: bltz t2, .LBB15_5 ; RV32I-NEXT: # %bb.4: -; RV32I-NEXT: sll a1, t1, a2 +; RV32I-NEXT: sll a1, a5, t2 ; RV32I-NEXT: bltz a6, .LBB15_6 ; RV32I-NEXT: j .LBB15_7 ; RV32I-NEXT: .LBB15_5: -; RV32I-NEXT: sll a2, t1, t2 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: lui a2, 524288 -; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a2, t0, a2 -; RV32I-NEXT: sub a5, a7, a5 -; RV32I-NEXT: srl a2, a2, a5 -; RV32I-NEXT: srli a5, t0, 31 +; RV32I-NEXT: sll a5, a5, t0 +; RV32I-NEXT: or a0, a0, a5 +; RV32I-NEXT: lui a5, 524288 +; RV32I-NEXT: addi a5, a5, -1 +; RV32I-NEXT: and a5, t1, a5 +; RV32I-NEXT: sub a2, a7, a2 +; RV32I-NEXT: srl a2, a5, a2 +; RV32I-NEXT: srli a5, t1, 31 ; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: sll a1, a1, t2 +; RV32I-NEXT: sll a1, a1, t0 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: bgez a6, .LBB15_7 ; RV32I-NEXT: .LBB15_6: @@ -643,82 +643,82 @@ define i64 @fshr_i64(i64 %a, i64 %b, i64 %c) nounwind { ; RV32I-NEXT: 
.LBB15_7: ; RV32I-NEXT: ret ; -; RV32B-LABEL: fshr_i64: -; RV32B: # %bb.0: -; RV32B-NEXT: srl a7, a2, a4 -; RV32B-NEXT: andi a5, a4, 63 -; RV32B-NEXT: addi a6, zero, 31 -; RV32B-NEXT: sub t0, a6, a5 -; RV32B-NEXT: slli a2, a3, 1 -; RV32B-NEXT: sll a2, a2, t0 -; RV32B-NEXT: or a7, a7, a2 -; RV32B-NEXT: addi t2, a5, -32 -; RV32B-NEXT: srl t0, a3, t2 -; RV32B-NEXT: slti a2, t2, 0 -; RV32B-NEXT: cmov a7, a2, a7, t0 -; RV32B-NEXT: not t3, a4 -; RV32B-NEXT: slli t0, a0, 1 -; RV32B-NEXT: sll t1, t0, t3 -; RV32B-NEXT: addi a5, zero, 63 -; RV32B-NEXT: andn t4, a5, a4 -; RV32B-NEXT: addi a2, t4, -32 -; RV32B-NEXT: srai a5, a2, 31 -; RV32B-NEXT: and a5, a5, t1 -; RV32B-NEXT: or a7, a5, a7 -; RV32B-NEXT: fsri a1, a0, a1, 31 -; RV32B-NEXT: sll a1, a1, t3 -; RV32B-NEXT: sub a5, a6, t4 -; RV32B-NEXT: bclri a0, a0, 31 -; RV32B-NEXT: srl a0, a0, a5 -; RV32B-NEXT: or a0, a1, a0 -; RV32B-NEXT: sll a1, t0, a2 -; RV32B-NEXT: slti a2, a2, 0 -; RV32B-NEXT: cmov a0, a2, a0, a1 -; RV32B-NEXT: srl a1, a3, a4 -; RV32B-NEXT: srai a2, t2, 31 -; RV32B-NEXT: and a1, a2, a1 -; RV32B-NEXT: or a1, a0, a1 -; RV32B-NEXT: mv a0, a7 -; RV32B-NEXT: ret -; -; RV32ZBT-LABEL: fshr_i64: -; RV32ZBT: # %bb.0: -; RV32ZBT-NEXT: srl a7, a2, a4 -; RV32ZBT-NEXT: andi a5, a4, 63 -; RV32ZBT-NEXT: addi a6, zero, 31 -; RV32ZBT-NEXT: sub t0, a6, a5 -; RV32ZBT-NEXT: slli a2, a3, 1 -; RV32ZBT-NEXT: sll a2, a2, t0 -; RV32ZBT-NEXT: or a7, a7, a2 -; RV32ZBT-NEXT: addi t2, a5, -32 -; RV32ZBT-NEXT: srl t0, a3, t2 -; RV32ZBT-NEXT: slti a2, t2, 0 -; RV32ZBT-NEXT: cmov a7, a2, a7, t0 -; RV32ZBT-NEXT: not t4, a4 -; RV32ZBT-NEXT: slli t0, a0, 1 -; RV32ZBT-NEXT: sll t1, t0, t4 -; RV32ZBT-NEXT: andi t3, t4, 63 -; RV32ZBT-NEXT: addi a5, t3, -32 -; RV32ZBT-NEXT: srai a2, a5, 31 -; RV32ZBT-NEXT: and a2, a2, t1 -; RV32ZBT-NEXT: or a7, a2, a7 -; RV32ZBT-NEXT: lui a2, 524288 -; RV32ZBT-NEXT: addi a2, a2, -1 -; RV32ZBT-NEXT: and t1, a0, a2 -; RV32ZBT-NEXT: sub a2, a6, t3 -; RV32ZBT-NEXT: srl a2, t1, a2 -; RV32ZBT-NEXT: fsri a0, a0, a1, 31 
-; RV32ZBT-NEXT: sll a0, a0, t4 -; RV32ZBT-NEXT: or a0, a0, a2 -; RV32ZBT-NEXT: sll a1, t0, a5 -; RV32ZBT-NEXT: slti a2, a5, 0 -; RV32ZBT-NEXT: cmov a0, a2, a0, a1 -; RV32ZBT-NEXT: srl a1, a3, a4 -; RV32ZBT-NEXT: srai a2, t2, 31 -; RV32ZBT-NEXT: and a1, a2, a1 -; RV32ZBT-NEXT: or a1, a0, a1 -; RV32ZBT-NEXT: mv a0, a7 -; RV32ZBT-NEXT: ret +; RV32IB-LABEL: fshr_i64: +; RV32IB: # %bb.0: +; RV32IB-NEXT: srl a7, a2, a4 +; RV32IB-NEXT: andi a5, a4, 63 +; RV32IB-NEXT: addi a6, zero, 31 +; RV32IB-NEXT: sub t0, a6, a5 +; RV32IB-NEXT: slli a2, a3, 1 +; RV32IB-NEXT: sll a2, a2, t0 +; RV32IB-NEXT: or t0, a7, a2 +; RV32IB-NEXT: addi a7, a5, -32 +; RV32IB-NEXT: srl a5, a3, a7 +; RV32IB-NEXT: slti a2, a7, 0 +; RV32IB-NEXT: cmov t1, a2, t0, a5 +; RV32IB-NEXT: not t0, a4 +; RV32IB-NEXT: slli t4, a0, 1 +; RV32IB-NEXT: sll t2, t4, t0 +; RV32IB-NEXT: addi a2, zero, 63 +; RV32IB-NEXT: andn a2, a2, a4 +; RV32IB-NEXT: addi t3, a2, -32 +; RV32IB-NEXT: srai a5, t3, 31 +; RV32IB-NEXT: and a5, a5, t2 +; RV32IB-NEXT: or t1, a5, t1 +; RV32IB-NEXT: fsri a1, a0, a1, 31 +; RV32IB-NEXT: sll a1, a1, t0 +; RV32IB-NEXT: sub a2, a6, a2 +; RV32IB-NEXT: bclri a0, a0, 31 +; RV32IB-NEXT: srl a0, a0, a2 +; RV32IB-NEXT: or a0, a1, a0 +; RV32IB-NEXT: sll a1, t4, t3 +; RV32IB-NEXT: slti a2, t3, 0 +; RV32IB-NEXT: cmov a0, a2, a0, a1 +; RV32IB-NEXT: srl a1, a3, a4 +; RV32IB-NEXT: srai a2, a7, 31 +; RV32IB-NEXT: and a1, a2, a1 +; RV32IB-NEXT: or a1, a0, a1 +; RV32IB-NEXT: mv a0, t1 +; RV32IB-NEXT: ret +; +; RV32IBT-LABEL: fshr_i64: +; RV32IBT: # %bb.0: +; RV32IBT-NEXT: srl a7, a2, a4 +; RV32IBT-NEXT: andi a5, a4, 63 +; RV32IBT-NEXT: addi a6, zero, 31 +; RV32IBT-NEXT: sub t0, a6, a5 +; RV32IBT-NEXT: slli a2, a3, 1 +; RV32IBT-NEXT: sll a2, a2, t0 +; RV32IBT-NEXT: or t0, a7, a2 +; RV32IBT-NEXT: addi a7, a5, -32 +; RV32IBT-NEXT: srl a5, a3, a7 +; RV32IBT-NEXT: slti a2, a7, 0 +; RV32IBT-NEXT: cmov t1, a2, t0, a5 +; RV32IBT-NEXT: not t0, a4 +; RV32IBT-NEXT: slli t4, a0, 1 +; RV32IBT-NEXT: sll t2, t4, t0 +; 
RV32IBT-NEXT: andi a2, t0, 63 +; RV32IBT-NEXT: addi t3, a2, -32 +; RV32IBT-NEXT: srai a5, t3, 31 +; RV32IBT-NEXT: and a5, a5, t2 +; RV32IBT-NEXT: or t1, a5, t1 +; RV32IBT-NEXT: lui a5, 524288 +; RV32IBT-NEXT: addi a5, a5, -1 +; RV32IBT-NEXT: and a5, a0, a5 +; RV32IBT-NEXT: sub a2, a6, a2 +; RV32IBT-NEXT: srl a2, a5, a2 +; RV32IBT-NEXT: fsri a0, a0, a1, 31 +; RV32IBT-NEXT: sll a0, a0, t0 +; RV32IBT-NEXT: or a0, a0, a2 +; RV32IBT-NEXT: sll a1, t4, t3 +; RV32IBT-NEXT: slti a2, t3, 0 +; RV32IBT-NEXT: cmov a0, a2, a0, a1 +; RV32IBT-NEXT: srl a1, a3, a4 +; RV32IBT-NEXT: srai a2, a7, 31 +; RV32IBT-NEXT: and a1, a2, a1 +; RV32IBT-NEXT: or a1, a0, a1 +; RV32IBT-NEXT: mv a0, t1 +; RV32IBT-NEXT: ret %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) ret i64 %1 } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll index b0ba851f143bb..c1a9fe20aa93e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -1072,18 +1072,18 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV32-LABEL: bitreverse_v8i32: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a7, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v25, (a7) +; LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a6) ; LMULMAX1-RV32-NEXT: vle32.v v26, (a0) ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8 ; LMULMAX1-RV32-NEXT: lui a2, 16 -; LMULMAX1-RV32-NEXT: addi t0, a2, -256 -; LMULMAX1-RV32-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV32-NEXT: addi a7, a2, -256 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7 ; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 24 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v25, 8 -; LMULMAX1-RV32-NEXT: lui a6, 4080 -; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV32-NEXT: lui t0, 4080 +; LMULMAX1-RV32-NEXT: vand.vx v28, 
v28, t0 ; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 @@ -1096,18 +1096,18 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5 ; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: lui a3, 209715 -; LMULMAX1-RV32-NEXT: addi a3, a3, 819 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3 +; LMULMAX1-RV32-NEXT: lui a1, 209715 +; LMULMAX1-RV32-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1 ; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV32-NEXT: lui a1, 838861 -; LMULMAX1-RV32-NEXT: addi a1, a1, -820 -; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: lui a2, 838861 +; LMULMAX1-RV32-NEXT: addi a2, a2, -820 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2 ; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: lui a2, 349525 -; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a2 +; LMULMAX1-RV32-NEXT: lui a3, 349525 +; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3 ; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27 ; LMULMAX1-RV32-NEXT: lui a4, 699051 ; LMULMAX1-RV32-NEXT: addi a4, a4, -1366 @@ -1115,11 +1115,11 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8 -; LMULMAX1-RV32-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a7 ; LMULMAX1-RV32-NEXT: vsrl.vi v28, v26, 24 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV32-NEXT: vsll.vi v28, v26, 8 -; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV32-NEXT: vand.vx v28, v28, t0 ; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 24 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v28 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 @@ -1128,97 
+1128,97 @@ define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5 ; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1 ; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2 ; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a2 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3 ; LMULMAX1-RV32-NEXT: vadd.vv v27, v27, v27 ; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4 ; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 ; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v25, (a7) +; LMULMAX1-RV32-NEXT: vse32.v v25, (a6) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bitreverse_v8i32: ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a7, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v25, (a7) +; LMULMAX1-RV64-NEXT: addi a6, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a6) ; LMULMAX1-RV64-NEXT: vle32.v v26, (a0) ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8 ; LMULMAX1-RV64-NEXT: lui a2, 16 -; LMULMAX1-RV64-NEXT: addiw t0, a2, -256 -; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -256 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24 ; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8 -; LMULMAX1-RV64-NEXT: lui a6, 4080 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: lui a7, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 ; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: lui a4, 61681 -; LMULMAX1-RV64-NEXT: addiw t1, a4, -241 -; 
LMULMAX1-RV64-NEXT: vand.vx v27, v25, t1 +; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 ; LMULMAX1-RV64-NEXT: lui a5, 241 ; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 240 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: addi t0, a5, 240 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t0 ; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a3, 209715 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a3 +; LMULMAX1-RV64-NEXT: lui a1, 209715 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: lui a1, 205 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -820 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: lui a3, 205 +; LMULMAX1-RV64-NEXT: addiw a3, a3, -819 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi t1, a3, -820 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t1 ; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a2, 349525 -; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a2 +; LMULMAX1-RV64-NEXT: lui a5, 349525 +; LMULMAX1-RV64-NEXT: addiw a5, a5, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a5 ; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV64-NEXT: lui a4, 171 -; LMULMAX1-RV64-NEXT: addiw a4, a4, -1365 -; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -1366 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV64-NEXT: lui a3, 171 +; LMULMAX1-RV64-NEXT: addiw a3, a3, -1365 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v25, 
v25, a3 ; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8 -; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a2 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24 ; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 ; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 24 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t1 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t0 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a3 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t1 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a2 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a5 ; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 ; LMULMAX1-RV64-NEXT: vse32.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v25, (a7) +; LMULMAX1-RV64-NEXT: vse32.v v25, (a6) ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y @@ -1432,8 +1432,8 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-LABEL: bitreverse_v4i64: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v30, (a1) +; 
LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v30, (a6) ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: addi a2, zero, 56 ; LMULMAX1-RV32-NEXT: vsrl.vx v26, v30, a2 @@ -1444,23 +1444,23 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a4 ; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v26 ; LMULMAX1-RV32-NEXT: vsrl.vi v26, v30, 24 -; LMULMAX1-RV32-NEXT: lui a6, 4080 -; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a6 -; LMULMAX1-RV32-NEXT: addi a5, zero, 5 +; LMULMAX1-RV32-NEXT: lui a5, 4080 +; LMULMAX1-RV32-NEXT: vand.vx v28, v26, a5 +; LMULMAX1-RV32-NEXT: addi a1, zero, 5 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu -; LMULMAX1-RV32-NEXT: vmv.s.x v0, a5 +; LMULMAX1-RV32-NEXT: vmv.s.x v0, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; LMULMAX1-RV32-NEXT: vmv.v.i v26, 0 -; LMULMAX1-RV32-NEXT: lui a5, 1044480 -; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a5, v0 +; LMULMAX1-RV32-NEXT: lui a1, 1044480 +; LMULMAX1-RV32-NEXT: vmerge.vxm v26, v26, a1, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vsrl.vi v29, v30, 8 ; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v26 ; LMULMAX1-RV32-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV32-NEXT: vor.vv v31, v28, v27 -; LMULMAX1-RV32-NEXT: addi a5, zero, 255 +; LMULMAX1-RV32-NEXT: addi a1, zero, 255 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v27, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v27, a1 ; LMULMAX1-RV32-NEXT: vmerge.vim v27, v27, 0, v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vsll.vi v28, v30, 8 @@ -1474,7 +1474,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v29 ; LMULMAX1-RV32-NEXT: vsll.vx v9, v30, a3 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v29, a6 +; LMULMAX1-RV32-NEXT: vmv.v.x v29, a5 ; LMULMAX1-RV32-NEXT: vmerge.vim v29, v29, 0, 
v0 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v29 @@ -1482,47 +1482,47 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v9 ; LMULMAX1-RV32-NEXT: vor.vv v30, v30, v8 ; LMULMAX1-RV32-NEXT: vor.vv v31, v30, v31 -; LMULMAX1-RV32-NEXT: lui a5, 61681 -; LMULMAX1-RV32-NEXT: addi a5, a5, -241 +; LMULMAX1-RV32-NEXT: lui a1, 61681 +; LMULMAX1-RV32-NEXT: addi a1, a1, -241 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v30, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v30, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vand.vv v8, v31, v30 ; LMULMAX1-RV32-NEXT: vsll.vi v8, v8, 4 -; LMULMAX1-RV32-NEXT: lui a5, 986895 -; LMULMAX1-RV32-NEXT: addi a5, a5, 240 +; LMULMAX1-RV32-NEXT: lui a1, 986895 +; LMULMAX1-RV32-NEXT: addi a1, a1, 240 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v9, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v9, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v9 ; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 4 ; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v8 -; LMULMAX1-RV32-NEXT: lui a5, 209715 -; LMULMAX1-RV32-NEXT: addi a5, a5, 819 +; LMULMAX1-RV32-NEXT: lui a1, 209715 +; LMULMAX1-RV32-NEXT: addi a1, a1, 819 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v8, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v8, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vand.vv v10, v31, v8 ; LMULMAX1-RV32-NEXT: vsll.vi v10, v10, 2 -; LMULMAX1-RV32-NEXT: lui a5, 838861 -; LMULMAX1-RV32-NEXT: addi a5, a5, -820 +; LMULMAX1-RV32-NEXT: lui a1, 838861 +; LMULMAX1-RV32-NEXT: addi a1, a1, -820 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v11, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v11, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; 
LMULMAX1-RV32-NEXT: vand.vv v31, v31, v11 ; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 2 ; LMULMAX1-RV32-NEXT: vor.vv v31, v31, v10 -; LMULMAX1-RV32-NEXT: lui a5, 349525 -; LMULMAX1-RV32-NEXT: addi a5, a5, 1365 +; LMULMAX1-RV32-NEXT: lui a1, 349525 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v10, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v10, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vand.vv v12, v31, v10 ; LMULMAX1-RV32-NEXT: vadd.vv v12, v12, v12 -; LMULMAX1-RV32-NEXT: lui a5, 699051 -; LMULMAX1-RV32-NEXT: addi a5, a5, -1366 +; LMULMAX1-RV32-NEXT: lui a1, 699051 +; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vmv.v.x v13, a5 +; LMULMAX1-RV32-NEXT: vmv.v.x v13, a1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vand.vv v31, v31, v13 ; LMULMAX1-RV32-NEXT: vsrl.vi v31, v31, 1 @@ -1532,7 +1532,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a4 ; LMULMAX1-RV32-NEXT: vor.vv v12, v14, v12 ; LMULMAX1-RV32-NEXT: vsrl.vi v14, v25, 24 -; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a6 +; LMULMAX1-RV32-NEXT: vand.vx v14, v14, a5 ; LMULMAX1-RV32-NEXT: vsrl.vi v15, v25, 8 ; LMULMAX1-RV32-NEXT: vand.vv v26, v15, v26 ; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v14 @@ -1564,7 +1564,7 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 ; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v31, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v31, (a6) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bitreverse_v4i64: @@ -1574,37 +1574,37 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill ; LMULMAX1-RV64-NEXT: .cfi_offset s0, -8 ; LMULMAX1-RV64-NEXT: 
vsetivli zero, 2, e64, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi t1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (t1) +; LMULMAX1-RV64-NEXT: addi a6, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a6) ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi a7, zero, 56 -; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, a7 -; LMULMAX1-RV64-NEXT: addi t0, zero, 40 -; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t0 +; LMULMAX1-RV64-NEXT: addi t0, zero, 56 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, t0 +; LMULMAX1-RV64-NEXT: addi t1, zero, 40 +; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t1 ; LMULMAX1-RV64-NEXT: lui a1, 16 -; LMULMAX1-RV64-NEXT: addiw t2, a1, -256 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2 +; LMULMAX1-RV64-NEXT: addiw t4, a1, -256 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24 -; LMULMAX1-RV64-NEXT: lui a6, 4080 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: lui a7, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 ; LMULMAX1-RV64-NEXT: vsrl.vi v29, v26, 8 -; LMULMAX1-RV64-NEXT: addi a1, zero, 255 -; LMULMAX1-RV64-NEXT: slli t3, a1, 24 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3 +; LMULMAX1-RV64-NEXT: addi a3, zero, 255 +; LMULMAX1-RV64-NEXT: slli a1, a3, 24 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8 -; LMULMAX1-RV64-NEXT: slli t4, a1, 32 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 +; LMULMAX1-RV64-NEXT: slli a5, a3, 32 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5 ; LMULMAX1-RV64-NEXT: vsll.vi v29, v26, 24 -; LMULMAX1-RV64-NEXT: slli t5, a1, 40 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t5 +; LMULMAX1-RV64-NEXT: slli a2, a3, 40 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 -; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, a7 -; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t0 -; LMULMAX1-RV64-NEXT: slli t6, a1, 48 -; 
LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6 +; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, t0 +; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t1 +; LMULMAX1-RV64-NEXT: slli a3, a3, 48 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 ; LMULMAX1-RV64-NEXT: vor.vv v26, v29, v26 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 @@ -1615,50 +1615,50 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 ; LMULMAX1-RV64-NEXT: addi a4, a4, 241 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -241 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 +; LMULMAX1-RV64-NEXT: addi t2, a4, -241 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t2 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: lui a5, 1044721 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, 240 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: lui a4, 1044721 +; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi t3, a4, 240 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t3 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: lui a2, 13107 -; LMULMAX1-RV64-NEXT: addiw a2, a2, 819 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, 819 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, 819 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, 819 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a2 +; LMULMAX1-RV64-NEXT: lui a4, 13107 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 
819 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 819 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 819 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi t5, a4, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t5 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-RV64-NEXT: lui a3, 1035469 -; LMULMAX1-RV64-NEXT: addiw a3, a3, -819 -; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi a3, a3, -819 -; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi a3, a3, -819 -; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi a3, a3, -820 -; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV64-NEXT: lui a4, 1035469 +; LMULMAX1-RV64-NEXT: addiw a4, a4, -819 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -819 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -819 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi t6, a4, -820 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: lui a1, 21845 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1 +; LMULMAX1-RV64-NEXT: lui a4, 21845 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 1365 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 1365 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 1365 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 ; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 ; LMULMAX1-RV64-NEXT: lui s0, 1026731 ; LMULMAX1-RV64-NEXT: addiw s0, s0, -1365 @@ 
-1671,44 +1671,44 @@ define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: vand.vx v26, v26, s0 ; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, a7 -; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t0 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, t0 +; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t1 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a7 ; LMULMAX1-RV64-NEXT: vsrl.vi v29, v25, 8 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a1 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 ; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 ; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8 -; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a5 ; LMULMAX1-RV64-NEXT: vsll.vi v29, v25, 24 -; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t5 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, a2 ; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 -; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, a7 -; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t0 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6 +; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, t0 +; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t1 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 ; LMULMAX1-RV64-NEXT: vor.vv v25, v29, v25 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t2 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 -; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t3 ; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a2 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t5 ; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 -; 
LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6 ; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 ; LMULMAX1-RV64-NEXT: vadd.vv v27, v27, v27 ; LMULMAX1-RV64-NEXT: vand.vx v25, v25, s0 ; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 ; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v26, (t1) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) ; LMULMAX1-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload ; LMULMAX1-RV64-NEXT: addi sp, sp, 16 ; LMULMAX1-RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll index 554b2f179b441..05fca7255b9ec 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -562,13 +562,13 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 ; LMULMAX2-RV32-NEXT: lui a3, 16 -; LMULMAX2-RV32-NEXT: addi a3, a3, -256 -; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: addi a6, a3, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a6 ; LMULMAX2-RV32-NEXT: srli a4, a1, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 ; LMULMAX2-RV32-NEXT: slli a4, a1, 8 -; LMULMAX2-RV32-NEXT: lui a6, 4080 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: lui a5, 4080 +; LMULMAX2-RV32-NEXT: and a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 @@ -577,11 +577,11 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: and a2, a2, a6 ; LMULMAX2-RV32-NEXT: srli a4, a1, 24 ; LMULMAX2-RV32-NEXT: 
or a2, a2, a4 ; LMULMAX2-RV32-NEXT: slli a4, a1, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: and a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a4 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 @@ -590,23 +590,23 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vsrl.vx v25, v25, a1 ; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX2-RV32-NEXT: srli a4, a2, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a3 -; LMULMAX2-RV32-NEXT: srli a5, a2, 24 -; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: slli a5, a2, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: srli a3, a2, 24 +; LMULMAX2-RV32-NEXT: or a3, a4, a3 +; LMULMAX2-RV32-NEXT: slli a4, a2, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a2, a2, 24 -; LMULMAX2-RV32-NEXT: or a2, a2, a5 ; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: or a2, a2, a3 ; LMULMAX2-RV32-NEXT: sw a2, 16(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v25, v26, a1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV32-NEXT: srli a2, a1, 8 -; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: and a2, a2, a6 ; LMULMAX2-RV32-NEXT: srli a3, a1, 24 ; LMULMAX2-RV32-NEXT: or a2, a2, a3 ; LMULMAX2-RV32-NEXT: slli a3, a1, 8 -; LMULMAX2-RV32-NEXT: and a3, a3, a6 +; LMULMAX2-RV32-NEXT: and a3, a3, a5 ; LMULMAX2-RV32-NEXT: slli a1, a1, 24 ; LMULMAX2-RV32-NEXT: or a1, a1, a3 ; LMULMAX2-RV32-NEXT: or a1, a1, a2 @@ -694,13 +694,13 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 ; LMULMAX1-RV32-NEXT: lui a3, 16 -; LMULMAX1-RV32-NEXT: addi a3, a3, -256 -; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: addi a6, a3, -256 +; LMULMAX1-RV32-NEXT: and a2, a2, a6 ; LMULMAX1-RV32-NEXT: srli a4, a1, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 ; LMULMAX1-RV32-NEXT: slli a4, a1, 8 -; LMULMAX1-RV32-NEXT: lui a6, 4080 -; 
LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: lui a5, 4080 +; LMULMAX1-RV32-NEXT: and a4, a4, a5 ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -709,11 +709,11 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a6 ; LMULMAX1-RV32-NEXT: srli a4, a1, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 ; LMULMAX1-RV32-NEXT: slli a4, a1, 8 -; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: and a4, a4, a5 ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or a1, a1, a4 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -722,23 +722,23 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a1 ; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV32-NEXT: srli a4, a2, 8 -; LMULMAX1-RV32-NEXT: and a4, a4, a3 -; LMULMAX1-RV32-NEXT: srli a5, a2, 24 -; LMULMAX1-RV32-NEXT: or a4, a4, a5 -; LMULMAX1-RV32-NEXT: slli a5, a2, 8 -; LMULMAX1-RV32-NEXT: and a5, a5, a6 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: srli a3, a2, 24 +; LMULMAX1-RV32-NEXT: or a3, a4, a3 +; LMULMAX1-RV32-NEXT: slli a4, a2, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a5 ; LMULMAX1-RV32-NEXT: slli a2, a2, 24 -; LMULMAX1-RV32-NEXT: or a2, a2, a5 ; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 ; LMULMAX1-RV32-NEXT: sw a2, 16(sp) ; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a1 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a6 ; LMULMAX1-RV32-NEXT: srli a3, a1, 24 ; LMULMAX1-RV32-NEXT: or a2, a2, a3 ; LMULMAX1-RV32-NEXT: slli a3, a1, 8 -; LMULMAX1-RV32-NEXT: and a3, a3, a6 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 ; LMULMAX1-RV32-NEXT: slli a1, a1, 24 ; LMULMAX1-RV32-NEXT: or 
a1, a1, a3 ; LMULMAX1-RV32-NEXT: or a1, a1, a2 @@ -1861,13 +1861,13 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26 ; LMULMAX2-RV32-NEXT: srli a2, a3, 8 ; LMULMAX2-RV32-NEXT: lui a1, 16 -; LMULMAX2-RV32-NEXT: addi a1, a1, -256 -; LMULMAX2-RV32-NEXT: and a2, a2, a1 +; LMULMAX2-RV32-NEXT: addi a6, a1, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a6 ; LMULMAX2-RV32-NEXT: srli a4, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a2, a4 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: lui a6, 4080 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: lui a2, 4080 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1876,11 +1876,11 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 ; LMULMAX2-RV32-NEXT: srli a5, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1888,11 +1888,11 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vslidedown.vi v30, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v30 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 ; LMULMAX2-RV32-NEXT: srli a5, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1900,11 +1900,11 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; 
LMULMAX2-RV32-NEXT: vslidedown.vi v8, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a3, v8 ; LMULMAX2-RV32-NEXT: srli a4, a3, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 ; LMULMAX2-RV32-NEXT: srli a5, a3, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 ; LMULMAX2-RV32-NEXT: slli a5, a3, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 ; LMULMAX2-RV32-NEXT: slli a3, a3, 24 ; LMULMAX2-RV32-NEXT: or a3, a3, a5 ; LMULMAX2-RV32-NEXT: or a3, a3, a4 @@ -1913,50 +1913,50 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v26, a3 ; LMULMAX2-RV32-NEXT: vmv.x.s a4, v26 ; LMULMAX2-RV32-NEXT: srli a5, a4, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a1 -; LMULMAX2-RV32-NEXT: srli a2, a4, 24 -; LMULMAX2-RV32-NEXT: or a2, a5, a2 -; LMULMAX2-RV32-NEXT: slli a5, a4, 8 ; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: srli a1, a4, 24 +; LMULMAX2-RV32-NEXT: or a1, a5, a1 +; LMULMAX2-RV32-NEXT: slli a5, a4, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 ; LMULMAX2-RV32-NEXT: slli a4, a4, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: or a2, a4, a2 -; LMULMAX2-RV32-NEXT: sw a2, 32(sp) +; LMULMAX2-RV32-NEXT: or a1, a4, a1 +; LMULMAX2-RV32-NEXT: sw a1, 32(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v28, a3 -; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV32-NEXT: srli a4, a2, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a1 -; LMULMAX2-RV32-NEXT: srli a5, a2, 24 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: srli a4, a1, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: slli a5, a2, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 -; LMULMAX2-RV32-NEXT: slli a2, a2, 24 -; LMULMAX2-RV32-NEXT: or a2, a2, a5 -; LMULMAX2-RV32-NEXT: or a2, a2, a4 -; LMULMAX2-RV32-NEXT: sw a2, 56(sp) +; LMULMAX2-RV32-NEXT: slli a5, a1, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; 
LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: sw a1, 56(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v30, a3 -; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV32-NEXT: srli a4, a2, 8 -; LMULMAX2-RV32-NEXT: and a4, a4, a1 -; LMULMAX2-RV32-NEXT: srli a5, a2, 24 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: srli a4, a1, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: srli a5, a1, 24 ; LMULMAX2-RV32-NEXT: or a4, a4, a5 -; LMULMAX2-RV32-NEXT: slli a5, a2, 8 -; LMULMAX2-RV32-NEXT: and a5, a5, a6 -; LMULMAX2-RV32-NEXT: slli a2, a2, 24 -; LMULMAX2-RV32-NEXT: or a2, a2, a5 -; LMULMAX2-RV32-NEXT: or a2, a2, a4 -; LMULMAX2-RV32-NEXT: sw a2, 48(sp) +; LMULMAX2-RV32-NEXT: slli a5, a1, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: sw a1, 48(sp) ; LMULMAX2-RV32-NEXT: vsrl.vx v26, v8, a3 -; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV32-NEXT: srli a3, a2, 8 -; LMULMAX2-RV32-NEXT: and a1, a3, a1 -; LMULMAX2-RV32-NEXT: srli a3, a2, 24 -; LMULMAX2-RV32-NEXT: or a1, a1, a3 -; LMULMAX2-RV32-NEXT: slli a3, a2, 8 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: srli a3, a1, 8 ; LMULMAX2-RV32-NEXT: and a3, a3, a6 -; LMULMAX2-RV32-NEXT: slli a2, a2, 24 -; LMULMAX2-RV32-NEXT: or a2, a2, a3 -; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a4, a1, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: slli a4, a1, 8 +; LMULMAX2-RV32-NEXT: and a2, a4, a2 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: or a1, a1, a3 ; LMULMAX2-RV32-NEXT: sw a1, 40(sp) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; LMULMAX2-RV32-NEXT: addi a1, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index b5dbfb8212892..e495cdedcaca6 100644 
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -2252,8 +2252,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX2-RV64-NEXT: lui a1, 16 -; LMULMAX2-RV64-NEXT: addiw a6, a1, -1 -; LMULMAX2-RV64-NEXT: and a2, a2, a6 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 ; LMULMAX2-RV64-NEXT: srli a3, a2, 1 ; LMULMAX2-RV64-NEXT: or a2, a2, a3 ; LMULMAX2-RV64-NEXT: srli a3, a2, 2 @@ -2275,8 +2275,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 +; LMULMAX2-RV64-NEXT: addi a6, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -2299,202 +2299,202 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a4, a4, -241 -; LMULMAX2-RV64-NEXT: and a1, a5, a4 +; LMULMAX2-RV64-NEXT: addi a7, a4, -241 +; LMULMAX2-RV64-NEXT: and a2, a5, a7 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 16(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; 
LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi 
a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 30(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; 
LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 28(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and 
a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 26(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; 
LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 24(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: 
or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 22(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; 
LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 20(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 20(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX2-RV64-NEXT: and a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 2 @@ -2509,7 +2509,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: not a1, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: and a2, a2, a6 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2 ; LMULMAX2-RV64-NEXT: and a2, a1, a3 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 @@ -2517,7 +2517,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x 
i16>* %y) { ; LMULMAX2-RV64-NEXT: add a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a1, a1, a7 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 @@ -2784,8 +2784,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: lui a1, 16 -; LMULMAX1-RV64-NEXT: addiw a6, a1, -1 -; LMULMAX1-RV64-NEXT: and a2, a2, a6 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: srli a3, a2, 2 @@ -2807,8 +2807,8 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 ; LMULMAX1-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a7, a2, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 +; LMULMAX1-RV64-NEXT: addi a6, a2, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 ; LMULMAX1-RV64-NEXT: sub a4, a3, a4 ; LMULMAX1-RV64-NEXT: lui a3, 13107 ; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 @@ -2831,22 +2831,202 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 ; LMULMAX1-RV64-NEXT: addi a4, a4, 241 ; LMULMAX1-RV64-NEXT: slli a4, a4, 12 -; LMULMAX1-RV64-NEXT: addi a4, a4, -241 -; LMULMAX1-RV64-NEXT: and a1, a5, a4 +; LMULMAX1-RV64-NEXT: addi a7, a4, -241 +; LMULMAX1-RV64-NEXT: and a2, a5, a7 ; LMULMAX1-RV64-NEXT: lui a5, 4112 ; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX1-RV64-NEXT: slli a5, a5, 16 ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: slli a5, a5, 16 ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 -; LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 16(sp) +; LMULMAX1-RV64-NEXT: mul 
a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: and a1, a1, a6 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a4, a2, a3 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: add a2, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 30(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; 
LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a4, a2, a3 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: add a2, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 28(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a4, a2, a3 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: add a2, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 26(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; 
LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a4, a2, a3 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: add a2, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 24(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a4, a2, a3 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: add a2, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: 
sh a2, 22(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: and a2, a2, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 2 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 8 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 16 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a2, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a4, a2, a3 +; LMULMAX1-RV64-NEXT: srli a2, a2, 2 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: add a2, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a2, 4 +; LMULMAX1-RV64-NEXT: add a2, a2, a4 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a2, a2, a5 +; LMULMAX1-RV64-NEXT: srli a2, a2, 56 +; LMULMAX1-RV64-NEXT: addi a2, a2, -48 +; LMULMAX1-RV64-NEXT: sh a2, 20(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: and a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a2, a1, 2 @@ -2861,7 +3041,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: not a1, a1 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: and a2, a2, a6 ; LMULMAX1-RV64-NEXT: sub a1, a1, a2 ; LMULMAX1-RV64-NEXT: and a2, a1, a3 ; LMULMAX1-RV64-NEXT: srli a1, a1, 2 @@ -2869,187 +3049,7 @@ define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) { ; LMULMAX1-RV64-NEXT: add a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a1, 4 ; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 -; 
LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 30(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: and a1, a1, a6 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 2 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 8 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 16 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: not a1, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: sub a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a2, a1, a3 -; LMULMAX1-RV64-NEXT: srli a1, a1, 2 -; LMULMAX1-RV64-NEXT: and a1, a1, a3 -; LMULMAX1-RV64-NEXT: add a1, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 -; LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 28(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: and a1, a1, a6 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 2 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 8 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 16 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: not a1, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; 
LMULMAX1-RV64-NEXT: sub a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a2, a1, a3 -; LMULMAX1-RV64-NEXT: srli a1, a1, 2 -; LMULMAX1-RV64-NEXT: and a1, a1, a3 -; LMULMAX1-RV64-NEXT: add a1, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 -; LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 26(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: and a1, a1, a6 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 2 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 8 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 16 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: not a1, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: sub a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a2, a1, a3 -; LMULMAX1-RV64-NEXT: srli a1, a1, 2 -; LMULMAX1-RV64-NEXT: and a1, a1, a3 -; LMULMAX1-RV64-NEXT: add a1, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 -; LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 24(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: and a1, a1, a6 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 2 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 8 -; 
LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 16 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: not a1, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: sub a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a2, a1, a3 -; LMULMAX1-RV64-NEXT: srli a1, a1, 2 -; LMULMAX1-RV64-NEXT: and a1, a1, a3 -; LMULMAX1-RV64-NEXT: add a1, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 -; LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 22(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: and a1, a1, a6 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 2 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 8 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 16 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: not a1, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: sub a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a2, a1, a3 -; LMULMAX1-RV64-NEXT: srli a1, a1, 2 -; LMULMAX1-RV64-NEXT: and a1, a1, a3 -; LMULMAX1-RV64-NEXT: add a1, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 -; LMULMAX1-RV64-NEXT: mul a1, a1, a5 -; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: addi a1, a1, -48 -; LMULMAX1-RV64-NEXT: sh a1, 20(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; 
LMULMAX1-RV64-NEXT: and a1, a1, a6 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 2 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 8 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 16 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: srli a2, a1, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: not a1, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 1 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: sub a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a2, a1, a3 -; LMULMAX1-RV64-NEXT: srli a1, a1, 2 -; LMULMAX1-RV64-NEXT: and a1, a1, a3 -; LMULMAX1-RV64-NEXT: add a1, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a1, 4 -; LMULMAX1-RV64-NEXT: add a1, a1, a2 -; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: and a1, a1, a7 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 @@ -3669,126 +3669,126 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX2-RV32-NEXT: sw zero, 28(sp) ; LMULMAX2-RV32-NEXT: sw zero, 20(sp) -; LMULMAX2-RV32-NEXT: addi a6, zero, 32 +; LMULMAX2-RV32-NEXT: addi a5, zero, 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6 -; LMULMAX2-RV32-NEXT: vmv.x.s a5, v26 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a4, a1, 1365 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a3, a1, 819 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a7, a1, -241 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a2, a1, 257 -; LMULMAX2-RV32-NEXT: bnez a5, .LBB3_2 +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a5 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: lui a2, 349525 +; LMULMAX2-RV32-NEXT: addi a4, a2, 1365 +; 
LMULMAX2-RV32-NEXT: lui a2, 209715 +; LMULMAX2-RV32-NEXT: addi a3, a2, 819 +; LMULMAX2-RV32-NEXT: lui a2, 61681 +; LMULMAX2-RV32-NEXT: addi a6, a2, -241 +; LMULMAX2-RV32-NEXT: lui a2, 4112 +; LMULMAX2-RV32-NEXT: addi a7, a2, 257 +; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_2 ; LMULMAX2-RV32-NEXT: # %bb.1: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a5, a1, 32 +; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; 
LMULMAX2-RV32-NEXT: j .LBB3_3 ; LMULMAX2-RV32-NEXT: .LBB3_2: -; LMULMAX2-RV32-NEXT: srli a1, a5, 1 -; LMULMAX2-RV32-NEXT: or a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a5, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_3: ; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sw a5, 16(sp) -; LMULMAX2-RV32-NEXT: bnez a1, .LBB3_5 +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v25, a5 +; LMULMAX2-RV32-NEXT: vmv.x.s a5, 
v26 +; LMULMAX2-RV32-NEXT: sw a1, 16(sp) +; LMULMAX2-RV32-NEXT: bnez a5, .LBB3_5 ; LMULMAX2-RV32-NEXT: # %bb.4: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a4, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a4 -; LMULMAX2-RV32-NEXT: and a4, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a4, a1 -; LMULMAX2-RV32-NEXT: srli a3, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB3_6 ; LMULMAX2-RV32-NEXT: .LBB3_5: -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; 
LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a1, a5, 1 +; LMULMAX2-RV32-NEXT: or a1, a5, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a4, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a4 -; LMULMAX2-RV32-NEXT: and a4, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a4, a1 -; LMULMAX2-RV32-NEXT: srli a3, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB3_6: ; LMULMAX2-RV32-NEXT: sw a1, 24(sp) @@ -3904,126 +3904,126 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV32-NEXT: sw zero, 28(sp) ; LMULMAX1-RV32-NEXT: sw zero, 20(sp) -; LMULMAX1-RV32-NEXT: addi a6, zero, 32 +; LMULMAX1-RV32-NEXT: addi a5, zero, 32 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6 -; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a4, a1, 1365 -; 
LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a3, a1, 819 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a7, a1, -241 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a2, a1, 257 -; LMULMAX1-RV32-NEXT: bnez a5, .LBB3_2 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a5 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: lui a2, 349525 +; LMULMAX1-RV32-NEXT: addi a4, a2, 1365 +; LMULMAX1-RV32-NEXT: lui a2, 209715 +; LMULMAX1-RV32-NEXT: addi a3, a2, 819 +; LMULMAX1-RV32-NEXT: lui a2, 61681 +; LMULMAX1-RV32-NEXT: addi a6, a2, -241 +; LMULMAX1-RV32-NEXT: lui a2, 4112 +; LMULMAX1-RV32-NEXT: addi a7, a2, 257 +; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_2 ; LMULMAX1-RV32-NEXT: # %bb.1: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: and a5, a5, a4 -; LMULMAX1-RV32-NEXT: sub a1, a1, a5 -; LMULMAX1-RV32-NEXT: and a5, a1, a3 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a3 -; LMULMAX1-RV32-NEXT: add a1, a5, a1 -; LMULMAX1-RV32-NEXT: 
srli a5, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a5 -; LMULMAX1-RV32-NEXT: and a1, a1, a7 -; LMULMAX1-RV32-NEXT: mul a1, a1, a2 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, a6 +; LMULMAX1-RV32-NEXT: mul a1, a1, a7 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: addi a5, a1, 32 +; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB3_3 ; LMULMAX1-RV32-NEXT: .LBB3_2: -; LMULMAX1-RV32-NEXT: srli a1, a5, 1 -; LMULMAX1-RV32-NEXT: or a1, a5, a1 -; LMULMAX1-RV32-NEXT: srli a5, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: and a5, a5, a4 -; LMULMAX1-RV32-NEXT: sub a1, a1, a5 -; LMULMAX1-RV32-NEXT: and a5, a1, a3 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a3 -; LMULMAX1-RV32-NEXT: add a1, a5, a1 -; LMULMAX1-RV32-NEXT: srli a5, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a5 -; LMULMAX1-RV32-NEXT: and a1, a1, a7 -; LMULMAX1-RV32-NEXT: mul a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, 
a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, a6 +; LMULMAX1-RV32-NEXT: mul a1, a1, a7 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB3_3: ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a6 -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: sw a5, 16(sp) -; LMULMAX1-RV32-NEXT: bnez a1, .LBB3_5 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a5 +; LMULMAX1-RV32-NEXT: vmv.x.s a5, v26 +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: bnez a5, .LBB3_5 ; LMULMAX1-RV32-NEXT: # %bb.4: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: and a4, a5, a4 -; LMULMAX1-RV32-NEXT: sub a1, a1, a4 -; LMULMAX1-RV32-NEXT: and a4, a1, a3 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a3 -; LMULMAX1-RV32-NEXT: add a1, a4, a1 -; LMULMAX1-RV32-NEXT: srli a3, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a1, a1, a7 -; LMULMAX1-RV32-NEXT: mul a1, a1, a2 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; 
LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, a6 +; LMULMAX1-RV32-NEXT: mul a1, a1, a7 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB3_6 ; LMULMAX1-RV32-NEXT: .LBB3_5: -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 -; LMULMAX1-RV32-NEXT: srli a5, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: srli a1, a5, 1 +; LMULMAX1-RV32-NEXT: or a1, a5, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 2 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: srli a2, a1, 16 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 ; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a5, a1, 1 -; LMULMAX1-RV32-NEXT: and a4, a5, a4 -; LMULMAX1-RV32-NEXT: sub a1, a1, a4 -; LMULMAX1-RV32-NEXT: and a4, a1, a3 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: sub a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a2, a1, a3 ; LMULMAX1-RV32-NEXT: srli a1, a1, 2 ; LMULMAX1-RV32-NEXT: and a1, a1, a3 -; LMULMAX1-RV32-NEXT: add a1, a4, a1 -; LMULMAX1-RV32-NEXT: srli a3, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a3 -; LMULMAX1-RV32-NEXT: and a1, a1, a7 -; LMULMAX1-RV32-NEXT: mul a1, a1, a2 +; LMULMAX1-RV32-NEXT: add a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 4 +; LMULMAX1-RV32-NEXT: add a1, a1, a2 +; LMULMAX1-RV32-NEXT: and a1, a1, a6 +; LMULMAX1-RV32-NEXT: mul a1, a1, a7 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB3_6: ; LMULMAX1-RV32-NEXT: sw a1, 24(sp) @@ -8513,8 +8513,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, 
<16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: vle16.v v26, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV64-NEXT: lui a1, 16 -; LMULMAX2-RV64-NEXT: addiw a6, a1, -1 -; LMULMAX2-RV64-NEXT: and a2, a2, a6 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 ; LMULMAX2-RV64-NEXT: srli a3, a2, 1 ; LMULMAX2-RV64-NEXT: or a2, a2, a3 ; LMULMAX2-RV64-NEXT: srli a3, a2, 2 @@ -8536,8 +8536,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 +; LMULMAX2-RV64-NEXT: addi a6, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -8560,52 +8560,442 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a4, a4, -241 -; LMULMAX2-RV64-NEXT: and a1, a5, a4 +; LMULMAX2-RV64-NEXT: addi a7, a4, -241 +; LMULMAX2-RV64-NEXT: and a2, a5, a7 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 32(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 32(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 1, e16, m2, ta, mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: 
or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 62(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; 
LMULMAX2-RV64-NEXT: sh a2, 62(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 60(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; 
LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 58(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 56(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; 
LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 54(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 52(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; 
LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 50(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and 
a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 48(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 46(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 
+; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 44(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 42(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; 
LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 40(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 38(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; 
LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 2 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 8 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 16 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: addi a2, a2, -48 +; LMULMAX2-RV64-NEXT: sh a2, 36(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: and a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 2 @@ -8620,7 +9010,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: not a1, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: and a2, a2, a6 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2 ; LMULMAX2-RV64-NEXT: and a2, a1, a3 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 @@ -8628,397 +9018,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: add a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: 
addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 60(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 58(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, 
a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 56(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 54(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 
-; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 52(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 50(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; 
LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 48(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli 
a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 46(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 44(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 
-; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 42(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 40(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; 
LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 38(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: addi a1, a1, -48 -; LMULMAX2-RV64-NEXT: sh a1, 36(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: and a1, a1, a6 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; 
LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 2 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 8 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 16 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 32 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a1, a1, a7 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: addi a1, a1, -48 @@ -9518,8 +9518,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: lui a2, 16 -; LMULMAX1-RV64-NEXT: addiw a7, a2, -1 -; LMULMAX1-RV64-NEXT: and a1, a1, a7 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -1 +; LMULMAX1-RV64-NEXT: and a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a3, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a3 ; LMULMAX1-RV64-NEXT: srli a3, a1, 2 @@ -9541,8 +9541,8 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 ; LMULMAX1-RV64-NEXT: addi a3, a3, 1365 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi t0, a3, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, t0 +; LMULMAX1-RV64-NEXT: addi a7, a3, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, a7 ; LMULMAX1-RV64-NEXT: sub a1, a1, a4 ; LMULMAX1-RV64-NEXT: lui a4, 13107 ; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 @@ -9565,441 +9565,441 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; 
LMULMAX1-RV64-NEXT: slli a5, a5, 12 ; LMULMAX1-RV64-NEXT: addi a5, a5, 241 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: and a2, a1, a5 +; LMULMAX1-RV64-NEXT: addi t0, a5, -241 +; LMULMAX1-RV64-NEXT: and a3, a1, t0 ; LMULMAX1-RV64-NEXT: lui a1, 4112 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 32(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 7 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, 
a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 46(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 46(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; 
LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 44(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: 
and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 42(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 42(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; 
LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 40(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 38(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; 
LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 36(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli 
a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 34(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 40(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; 
LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 16(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 38(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: 
srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 36(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 34(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; 
LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 16(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, 
a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 30(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 30(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; 
LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 28(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: 
srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 26(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 26(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; 
LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 24(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: 
or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 22(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 22(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; 
LMULMAX1-RV64-NEXT: srli a3, a2, 2 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 8 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 16 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 2 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 8 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 16 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 32 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: addi a2, a2, -48 -; LMULMAX1-RV64-NEXT: sh a2, 20(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: addi a3, a3, -48 +; LMULMAX1-RV64-NEXT: sh a3, 20(sp) 
; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25 +; LMULMAX1-RV64-NEXT: and a2, a3, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: srli a3, a2, 2 @@ -10014,7 +10014,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: not a2, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: and a3, a3, a7 ; LMULMAX1-RV64-NEXT: sub a2, a2, a3 ; LMULMAX1-RV64-NEXT: and a3, a2, a4 ; LMULMAX1-RV64-NEXT: srli a2, a2, 2 @@ -10022,7 +10022,7 @@ define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX1-RV64-NEXT: add a2, a3, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 4 ; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: and a2, a2, t0 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: addi a1, a1, -48 @@ -11138,240 +11138,240 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: sw zero, 52(sp) ; LMULMAX2-RV32-NEXT: sw zero, 44(sp) ; LMULMAX2-RV32-NEXT: sw zero, 36(sp) -; LMULMAX2-RV32-NEXT: addi a6, zero, 32 +; LMULMAX2-RV32-NEXT: addi a5, zero, 32 ; LMULMAX2-RV32-NEXT: vsetivli zero, 1, e64, m2, ta, mu -; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6 -; LMULMAX2-RV32-NEXT: vmv.x.s a5, v28 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a4, a1, 1365 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a3, a1, 819 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a7, a1, -241 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a2, a1, 257 -; LMULMAX2-RV32-NEXT: bnez a5, .LBB7_2 +; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a5 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 +; LMULMAX2-RV32-NEXT: lui a2, 349525 +; LMULMAX2-RV32-NEXT: addi a4, a2, 
1365 +; LMULMAX2-RV32-NEXT: lui a2, 209715 +; LMULMAX2-RV32-NEXT: addi a3, a2, 819 +; LMULMAX2-RV32-NEXT: lui a2, 61681 +; LMULMAX2-RV32-NEXT: addi a6, a2, -241 +; LMULMAX2-RV32-NEXT: lui a2, 4112 +; LMULMAX2-RV32-NEXT: addi a7, a2, 257 +; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_2 ; LMULMAX2-RV32-NEXT: # %bb.1: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a5, a1, 32 +; LMULMAX2-RV32-NEXT: addi a1, 
a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_3 ; LMULMAX2-RV32-NEXT: .LBB7_2: -; LMULMAX2-RV32-NEXT: srli a1, a5, 1 -; LMULMAX2-RV32-NEXT: or a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a5, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_3: ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30 -; LMULMAX2-RV32-NEXT: sw a5, 32(sp) -; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_5 +; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a5 +; LMULMAX2-RV32-NEXT: 
vmv.x.s a2, v30 +; LMULMAX2-RV32-NEXT: sw a1, 32(sp) +; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_5 ; LMULMAX2-RV32-NEXT: # %bb.4: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a5, a1, 32 +; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_6 ; LMULMAX2-RV32-NEXT: .LBB7_5: -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; 
LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a1, a2, 1 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a5, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_6: ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v30 -; LMULMAX2-RV32-NEXT: sw a5, 56(sp) -; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_8 +; LMULMAX2-RV32-NEXT: vsrl.vx v30, v28, a5 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v30 +; LMULMAX2-RV32-NEXT: sw a1, 56(sp) +; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_8 ; LMULMAX2-RV32-NEXT: # %bb.7: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: 
srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 -; LMULMAX2-RV32-NEXT: addi a5, a1, 32 +; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_9 ; LMULMAX2-RV32-NEXT: .LBB7_8: -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 
-; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a1, a2, 1 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a5, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a5, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a5, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a5 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 -; LMULMAX2-RV32-NEXT: srli a5, a1, 24 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_9: ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sw a5, 48(sp) -; LMULMAX2-RV32-NEXT: bnez a1, .LBB7_11 +; LMULMAX2-RV32-NEXT: vsrl.vx v28, v26, a5 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV32-NEXT: sw a1, 48(sp) +; LMULMAX2-RV32-NEXT: bnez a2, .LBB7_11 ; LMULMAX2-RV32-NEXT: # %bb.10: ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; 
LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a4, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a4 -; LMULMAX2-RV32-NEXT: and a4, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a4, a1 -; LMULMAX2-RV32-NEXT: srli a3, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: addi a1, a1, 32 ; LMULMAX2-RV32-NEXT: j .LBB7_12 ; LMULMAX2-RV32-NEXT: .LBB7_11: -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 2 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 4 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 8 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 -; LMULMAX2-RV32-NEXT: srli a5, a1, 16 -; LMULMAX2-RV32-NEXT: or a1, a1, a5 +; LMULMAX2-RV32-NEXT: srli a1, a2, 1 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 2 +; 
LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: srli a2, a1, 16 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 ; LMULMAX2-RV32-NEXT: not a1, a1 -; LMULMAX2-RV32-NEXT: srli a5, a1, 1 -; LMULMAX2-RV32-NEXT: and a4, a5, a4 -; LMULMAX2-RV32-NEXT: sub a1, a1, a4 -; LMULMAX2-RV32-NEXT: and a4, a1, a3 +; LMULMAX2-RV32-NEXT: srli a2, a1, 1 +; LMULMAX2-RV32-NEXT: and a2, a2, a4 +; LMULMAX2-RV32-NEXT: sub a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a2, a1, a3 ; LMULMAX2-RV32-NEXT: srli a1, a1, 2 ; LMULMAX2-RV32-NEXT: and a1, a1, a3 -; LMULMAX2-RV32-NEXT: add a1, a4, a1 -; LMULMAX2-RV32-NEXT: srli a3, a1, 4 -; LMULMAX2-RV32-NEXT: add a1, a1, a3 -; LMULMAX2-RV32-NEXT: and a1, a1, a7 -; LMULMAX2-RV32-NEXT: mul a1, a1, a2 +; LMULMAX2-RV32-NEXT: add a1, a2, a1 +; LMULMAX2-RV32-NEXT: srli a2, a1, 4 +; LMULMAX2-RV32-NEXT: add a1, a1, a2 +; LMULMAX2-RV32-NEXT: and a1, a1, a6 +; LMULMAX2-RV32-NEXT: mul a1, a1, a7 ; LMULMAX2-RV32-NEXT: srli a1, a1, 24 ; LMULMAX2-RV32-NEXT: .LBB7_12: ; LMULMAX2-RV32-NEXT: sw a1, 40(sp) @@ -11557,195 +11557,195 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi a6, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) -; LMULMAX1-RV32-NEXT: sw zero, 44(sp) -; LMULMAX1-RV32-NEXT: sw zero, 36(sp) -; LMULMAX1-RV32-NEXT: addi a7, zero, 32 -; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 -; LMULMAX1-RV32-NEXT: lui a2, 349525 -; LMULMAX1-RV32-NEXT: addi a5, a2, 1365 -; LMULMAX1-RV32-NEXT: lui a2, 209715 -; LMULMAX1-RV32-NEXT: addi a4, a2, 819 -; LMULMAX1-RV32-NEXT: lui a2, 61681 -; LMULMAX1-RV32-NEXT: addi t0, a2, -241 -; LMULMAX1-RV32-NEXT: lui a2, 4112 -; 
LMULMAX1-RV32-NEXT: addi a3, a2, 257 -; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_2 -; LMULMAX1-RV32-NEXT: # %bb.1: -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: and a2, a2, a5 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a2, a1, a4 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: add a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: addi a1, a1, 32 +; LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) +; LMULMAX1-RV32-NEXT: sw zero, 44(sp) +; LMULMAX1-RV32-NEXT: sw zero, 36(sp) +; LMULMAX1-RV32-NEXT: addi a1, zero, 32 +; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: lui a3, 349525 +; LMULMAX1-RV32-NEXT: addi a5, a3, 1365 +; LMULMAX1-RV32-NEXT: lui a3, 209715 +; LMULMAX1-RV32-NEXT: addi a4, a3, 819 +; LMULMAX1-RV32-NEXT: lui a3, 61681 +; LMULMAX1-RV32-NEXT: addi a7, a3, -241 +; LMULMAX1-RV32-NEXT: lui a3, 4112 +; LMULMAX1-RV32-NEXT: addi t0, a3, 257 +; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_2 +; LMULMAX1-RV32-NEXT: # %bb.1: +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 2 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, 
a2, 4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 8 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 16 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: not a2, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: sub a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a3, a2, a4 +; LMULMAX1-RV32-NEXT: srli a2, a2, 2 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: add a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: add a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a7 +; LMULMAX1-RV32-NEXT: mul a2, a2, t0 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: addi a2, a2, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_3 ; LMULMAX1-RV32-NEXT: .LBB7_2: -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: and a2, a2, a5 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a2, a1, a4 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: add a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 2 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 8 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 16 +; 
LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: not a2, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: sub a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a3, a2, a4 +; LMULMAX1-RV32-NEXT: srli a2, a2, 2 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: add a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: add a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a7 +; LMULMAX1-RV32-NEXT: mul a2, a2, t0 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: .LBB7_3: ; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV32-NEXT: sw a1, 32(sp) -; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_5 +; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV32-NEXT: sw a2, 32(sp) +; LMULMAX1-RV32-NEXT: bnez a3, .LBB7_5 ; LMULMAX1-RV32-NEXT: # %bb.4: -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: and a2, a2, a5 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a2, a1, a4 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: add a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a1, a1, 24 -; LMULMAX1-RV32-NEXT: addi a1, a1, 32 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: 
or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 2 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 8 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 16 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: not a2, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: sub a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a3, a2, a4 +; LMULMAX1-RV32-NEXT: srli a2, a2, 2 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: add a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: add a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a7 +; LMULMAX1-RV32-NEXT: mul a2, a2, t0 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: addi a2, a2, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_6 ; LMULMAX1-RV32-NEXT: .LBB7_5: -; LMULMAX1-RV32-NEXT: srli a1, a2, 1 -; LMULMAX1-RV32-NEXT: or a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: and a2, a2, a5 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a2, a1, a4 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: add a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: srli a2, a3, 1 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 2 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 
+; LMULMAX1-RV32-NEXT: srli a3, a2, 8 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 16 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: not a2, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: sub a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a3, a2, a4 +; LMULMAX1-RV32-NEXT: srli a2, a2, 2 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: add a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: add a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a7 +; LMULMAX1-RV32-NEXT: mul a2, a2, t0 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: .LBB7_6: -; LMULMAX1-RV32-NEXT: sw a1, 40(sp) +; LMULMAX1-RV32-NEXT: sw a2, 40(sp) ; LMULMAX1-RV32-NEXT: sw zero, 28(sp) -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV32-NEXT: sw zero, 20(sp) -; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8 +; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_8 ; LMULMAX1-RV32-NEXT: # %bb.7: -; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: and a2, a2, a5 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a2, a1, a4 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: add a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a1, 
a1, 24 -; LMULMAX1-RV32-NEXT: addi a1, a1, 32 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 2 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 8 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 16 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: not a2, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: sub a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a3, a2, a4 +; LMULMAX1-RV32-NEXT: srli a2, a2, 2 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: add a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: add a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a7 +; LMULMAX1-RV32-NEXT: mul a2, a2, t0 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: addi a2, a2, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_9 ; LMULMAX1-RV32-NEXT: .LBB7_8: -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 2 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 8 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: srli a2, a1, 16 -; LMULMAX1-RV32-NEXT: or a1, a1, a2 -; LMULMAX1-RV32-NEXT: not a1, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 1 -; LMULMAX1-RV32-NEXT: and a2, a2, a5 -; LMULMAX1-RV32-NEXT: sub a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a2, a1, a4 -; LMULMAX1-RV32-NEXT: srli a1, a1, 2 -; LMULMAX1-RV32-NEXT: and a1, a1, a4 -; LMULMAX1-RV32-NEXT: add a1, a2, a1 -; LMULMAX1-RV32-NEXT: srli a2, a1, 4 -; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 -; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; 
LMULMAX1-RV32-NEXT: srli a3, a2, 2 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 8 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a2, 16 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: not a2, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 1 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: sub a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a3, a2, a4 +; LMULMAX1-RV32-NEXT: srli a2, a2, 2 +; LMULMAX1-RV32-NEXT: and a2, a2, a4 +; LMULMAX1-RV32-NEXT: add a2, a3, a2 +; LMULMAX1-RV32-NEXT: srli a3, a2, 4 +; LMULMAX1-RV32-NEXT: add a2, a2, a3 +; LMULMAX1-RV32-NEXT: and a2, a2, a7 +; LMULMAX1-RV32-NEXT: mul a2, a2, t0 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 ; LMULMAX1-RV32-NEXT: .LBB7_9: ; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 -; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV32-NEXT: sw a1, 16(sp) -; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: sw a2, 16(sp) +; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_11 ; LMULMAX1-RV32-NEXT: # %bb.10: ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: srli a2, a1, 1 @@ -11768,14 +11768,14 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: add a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 ; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: and a1, a1, a7 +; LMULMAX1-RV32-NEXT: mul a1, a1, t0 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: addi a1, a1, 32 ; LMULMAX1-RV32-NEXT: j .LBB7_12 ; LMULMAX1-RV32-NEXT: .LBB7_11: -; LMULMAX1-RV32-NEXT: srli a1, a2, 1 -; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: srli a2, a1, 1 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 ; LMULMAX1-RV32-NEXT: srli a2, a1, 2 ; LMULMAX1-RV32-NEXT: or a1, 
a1, a2 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 @@ -11794,8 +11794,8 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: add a1, a2, a1 ; LMULMAX1-RV32-NEXT: srli a2, a1, 4 ; LMULMAX1-RV32-NEXT: add a1, a1, a2 -; LMULMAX1-RV32-NEXT: and a1, a1, t0 -; LMULMAX1-RV32-NEXT: mul a1, a1, a3 +; LMULMAX1-RV32-NEXT: and a1, a1, a7 +; LMULMAX1-RV32-NEXT: mul a1, a1, t0 ; LMULMAX1-RV32-NEXT: srli a1, a1, 24 ; LMULMAX1-RV32-NEXT: .LBB7_12: ; LMULMAX1-RV32-NEXT: sw a1, 24(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 762474381b247..cfd16b807eff0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -2208,8 +2208,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: slli a6, a1, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a6 +; LMULMAX2-RV64-NEXT: slli a1, a1, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 ; LMULMAX2-RV64-NEXT: addi a3, a2, -1 ; LMULMAX2-RV64-NEXT: not a2, a2 ; LMULMAX2-RV64-NEXT: and a3, a2, a3 @@ -2221,8 +2221,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 +; LMULMAX2-RV64-NEXT: addi a6, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -2245,71 +2245,71 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a5, a5, 12 ; LMULMAX2-RV64-NEXT: addi a5, a5, 241 ; LMULMAX2-RV64-NEXT: slli a5, a5, 12 -; LMULMAX2-RV64-NEXT: addi a5, a5, -241 -; LMULMAX2-RV64-NEXT: and a4, a4, a5 -; LMULMAX2-RV64-NEXT: lui a1, 4112 
-; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: slli a1, a1, 16 -; LMULMAX2-RV64-NEXT: addi a1, a1, 257 -; LMULMAX2-RV64-NEXT: slli a1, a1, 16 -; LMULMAX2-RV64-NEXT: addi a1, a1, 257 -; LMULMAX2-RV64-NEXT: mul a4, a4, a1 +; LMULMAX2-RV64-NEXT: addi a7, a5, -241 +; LMULMAX2-RV64-NEXT: and a4, a4, a7 +; LMULMAX2-RV64-NEXT: lui a2, 4112 +; LMULMAX2-RV64-NEXT: addiw a2, a2, 257 +; LMULMAX2-RV64-NEXT: slli a2, a2, 16 +; LMULMAX2-RV64-NEXT: addi a2, a2, 257 +; LMULMAX2-RV64-NEXT: slli a2, a2, 16 +; LMULMAX2-RV64-NEXT: addi a2, a2, 257 +; LMULMAX2-RV64-NEXT: mul a4, a4, a2 ; LMULMAX2-RV64-NEXT: srli a4, a4, 56 ; LMULMAX2-RV64-NEXT: sw a4, 28(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX2-RV64-NEXT: or a4, a4, a6 -; LMULMAX2-RV64-NEXT: addi a2, a4, -1 +; LMULMAX2-RV64-NEXT: or a4, a4, a1 +; LMULMAX2-RV64-NEXT: addi a5, a4, -1 ; LMULMAX2-RV64-NEXT: not a4, a4 -; LMULMAX2-RV64-NEXT: and a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a5 +; LMULMAX2-RV64-NEXT: srli a5, a4, 1 +; LMULMAX2-RV64-NEXT: and a5, a5, a6 +; LMULMAX2-RV64-NEXT: sub a4, a4, a5 +; LMULMAX2-RV64-NEXT: and a5, a4, a3 +; LMULMAX2-RV64-NEXT: srli a4, a4, 2 +; LMULMAX2-RV64-NEXT: and a4, a4, a3 +; LMULMAX2-RV64-NEXT: add a4, a5, a4 +; LMULMAX2-RV64-NEXT: srli a5, a4, 4 +; LMULMAX2-RV64-NEXT: add a4, a4, a5 ; LMULMAX2-RV64-NEXT: and a4, a4, a7 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a2, a2, a5 -; LMULMAX2-RV64-NEXT: mul a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 24(sp) +; LMULMAX2-RV64-NEXT: mul a4, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a4, 56 +; LMULMAX2-RV64-NEXT: sw a4, 24(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v26, 
v25, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX2-RV64-NEXT: or a2, a2, a6 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a4, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a2, a2, a5 -; LMULMAX2-RV64-NEXT: mul a2, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a2, 56 -; LMULMAX2-RV64-NEXT: sw a2, 20(sp) -; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX2-RV64-NEXT: or a2, a2, a6 -; LMULMAX2-RV64-NEXT: addi a4, a2, -1 -; LMULMAX2-RV64-NEXT: not a2, a2 -; LMULMAX2-RV64-NEXT: and a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a4, v26 +; LMULMAX2-RV64-NEXT: or a4, a4, a1 +; LMULMAX2-RV64-NEXT: addi a5, a4, -1 +; LMULMAX2-RV64-NEXT: not a4, a4 +; LMULMAX2-RV64-NEXT: and a4, a4, a5 +; LMULMAX2-RV64-NEXT: srli a5, a4, 1 +; LMULMAX2-RV64-NEXT: and a5, a5, a6 +; LMULMAX2-RV64-NEXT: sub a4, a4, a5 +; LMULMAX2-RV64-NEXT: and a5, a4, a3 +; LMULMAX2-RV64-NEXT: srli a4, a4, 2 +; LMULMAX2-RV64-NEXT: and a4, a4, a3 +; LMULMAX2-RV64-NEXT: add a4, a5, a4 +; LMULMAX2-RV64-NEXT: srli a5, a4, 4 +; LMULMAX2-RV64-NEXT: add a4, a4, a5 ; LMULMAX2-RV64-NEXT: and a4, a4, a7 -; LMULMAX2-RV64-NEXT: sub a2, a2, a4 -; LMULMAX2-RV64-NEXT: and a4, a2, a3 -; LMULMAX2-RV64-NEXT: srli a2, a2, 2 -; LMULMAX2-RV64-NEXT: and a2, a2, a3 -; LMULMAX2-RV64-NEXT: add a2, a4, a2 -; LMULMAX2-RV64-NEXT: srli a3, a2, 4 -; LMULMAX2-RV64-NEXT: add a2, a2, a3 -; LMULMAX2-RV64-NEXT: and a2, a2, a5 -; LMULMAX2-RV64-NEXT: mul a1, a2, a1 +; LMULMAX2-RV64-NEXT: mul a4, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a4, 56 +; LMULMAX2-RV64-NEXT: sw a4, 20(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a4, v25 +; LMULMAX2-RV64-NEXT: or 
a1, a4, a1 +; LMULMAX2-RV64-NEXT: addi a4, a1, -1 +; LMULMAX2-RV64-NEXT: not a1, a1 +; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a4, a1, a3 +; LMULMAX2-RV64-NEXT: srli a1, a1, 2 +; LMULMAX2-RV64-NEXT: and a1, a1, a3 +; LMULMAX2-RV64-NEXT: add a1, a4, a1 +; LMULMAX2-RV64-NEXT: srli a3, a1, 4 +; LMULMAX2-RV64-NEXT: add a1, a1, a3 +; LMULMAX2-RV64-NEXT: and a1, a1, a7 +; LMULMAX2-RV64-NEXT: mul a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sw a1, 16(sp) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -2422,8 +2422,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: addi a1, zero, 1 -; LMULMAX1-RV64-NEXT: slli a6, a1, 32 -; LMULMAX1-RV64-NEXT: or a2, a2, a6 +; LMULMAX1-RV64-NEXT: slli a1, a1, 32 +; LMULMAX1-RV64-NEXT: or a2, a2, a1 ; LMULMAX1-RV64-NEXT: addi a3, a2, -1 ; LMULMAX1-RV64-NEXT: not a2, a2 ; LMULMAX1-RV64-NEXT: and a3, a2, a3 @@ -2435,8 +2435,8 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 ; LMULMAX1-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a7, a2, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 +; LMULMAX1-RV64-NEXT: addi a6, a2, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 ; LMULMAX1-RV64-NEXT: sub a4, a3, a4 ; LMULMAX1-RV64-NEXT: lui a3, 13107 ; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 @@ -2459,71 +2459,71 @@ define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 ; LMULMAX1-RV64-NEXT: addi a5, a5, 241 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: and a4, a4, a5 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; 
LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: mul a4, a4, a1 +; LMULMAX1-RV64-NEXT: addi a7, a5, -241 +; LMULMAX1-RV64-NEXT: and a4, a4, a7 +; LMULMAX1-RV64-NEXT: lui a2, 4112 +; LMULMAX1-RV64-NEXT: addiw a2, a2, 257 +; LMULMAX1-RV64-NEXT: slli a2, a2, 16 +; LMULMAX1-RV64-NEXT: addi a2, a2, 257 +; LMULMAX1-RV64-NEXT: slli a2, a2, 16 +; LMULMAX1-RV64-NEXT: addi a2, a2, 257 +; LMULMAX1-RV64-NEXT: mul a4, a4, a2 ; LMULMAX1-RV64-NEXT: srli a4, a4, 56 ; LMULMAX1-RV64-NEXT: sw a4, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 ; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX1-RV64-NEXT: or a4, a4, a6 -; LMULMAX1-RV64-NEXT: addi a2, a4, -1 +; LMULMAX1-RV64-NEXT: or a4, a4, a1 +; LMULMAX1-RV64-NEXT: addi a5, a4, -1 ; LMULMAX1-RV64-NEXT: not a4, a4 -; LMULMAX1-RV64-NEXT: and a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a5 +; LMULMAX1-RV64-NEXT: srli a5, a4, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a6 +; LMULMAX1-RV64-NEXT: sub a4, a4, a5 +; LMULMAX1-RV64-NEXT: and a5, a4, a3 +; LMULMAX1-RV64-NEXT: srli a4, a4, 2 +; LMULMAX1-RV64-NEXT: and a4, a4, a3 +; LMULMAX1-RV64-NEXT: add a4, a5, a4 +; LMULMAX1-RV64-NEXT: srli a5, a4, 4 +; LMULMAX1-RV64-NEXT: add a4, a4, a5 ; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 24(sp) +; LMULMAX1-RV64-NEXT: mul a4, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a4, 56 +; LMULMAX1-RV64-NEXT: sw a4, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a6 
-; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 -; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a4, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 20(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: or a2, a2, a6 -; LMULMAX1-RV64-NEXT: addi a4, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 +; LMULMAX1-RV64-NEXT: or a4, a4, a1 +; LMULMAX1-RV64-NEXT: addi a5, a4, -1 +; LMULMAX1-RV64-NEXT: not a4, a4 +; LMULMAX1-RV64-NEXT: and a4, a4, a5 +; LMULMAX1-RV64-NEXT: srli a5, a4, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a6 +; LMULMAX1-RV64-NEXT: sub a4, a4, a5 +; LMULMAX1-RV64-NEXT: and a5, a4, a3 +; LMULMAX1-RV64-NEXT: srli a4, a4, 2 +; LMULMAX1-RV64-NEXT: and a4, a4, a3 +; LMULMAX1-RV64-NEXT: add a4, a5, a4 +; LMULMAX1-RV64-NEXT: srli a5, a4, 4 +; LMULMAX1-RV64-NEXT: add a4, a4, a5 ; LMULMAX1-RV64-NEXT: and a4, a4, a7 -; LMULMAX1-RV64-NEXT: sub a2, a2, a4 -; LMULMAX1-RV64-NEXT: and a4, a2, a3 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: add a2, a4, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a1, a2, a1 +; LMULMAX1-RV64-NEXT: mul a4, a4, a2 +; LMULMAX1-RV64-NEXT: srli a4, a4, 56 +; LMULMAX1-RV64-NEXT: sw a4, 20(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a4, v25 +; LMULMAX1-RV64-NEXT: or a1, a4, a1 +; LMULMAX1-RV64-NEXT: addi a4, a1, -1 +; LMULMAX1-RV64-NEXT: not a1, 
a1 +; LMULMAX1-RV64-NEXT: and a1, a1, a4 +; LMULMAX1-RV64-NEXT: srli a4, a1, 1 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: sub a1, a1, a4 +; LMULMAX1-RV64-NEXT: and a4, a1, a3 +; LMULMAX1-RV64-NEXT: srli a1, a1, 2 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: add a1, a4, a1 +; LMULMAX1-RV64-NEXT: srli a3, a1, 4 +; LMULMAX1-RV64-NEXT: add a1, a1, a3 +; LMULMAX1-RV64-NEXT: and a1, a1, a7 +; LMULMAX1-RV64-NEXT: mul a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: sw a1, 16(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -7091,8 +7091,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 ; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: slli a6, a1, 32 -; LMULMAX2-RV64-NEXT: or a2, a2, a6 +; LMULMAX2-RV64-NEXT: slli a1, a1, 32 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 ; LMULMAX2-RV64-NEXT: addi a3, a2, -1 ; LMULMAX2-RV64-NEXT: not a2, a2 ; LMULMAX2-RV64-NEXT: and a3, a2, a3 @@ -7104,8 +7104,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 ; LMULMAX2-RV64-NEXT: addi a2, a2, 1365 ; LMULMAX2-RV64-NEXT: slli a2, a2, 12 -; LMULMAX2-RV64-NEXT: addi a7, a2, 1365 -; LMULMAX2-RV64-NEXT: and a4, a4, a7 +; LMULMAX2-RV64-NEXT: addi a6, a2, 1365 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 ; LMULMAX2-RV64-NEXT: sub a4, a3, a4 ; LMULMAX2-RV64-NEXT: lui a3, 13107 ; LMULMAX2-RV64-NEXT: addiw a3, a3, 819 @@ -7128,138 +7128,138 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 ; LMULMAX2-RV64-NEXT: addi a4, a4, 241 ; LMULMAX2-RV64-NEXT: slli a4, a4, 12 -; LMULMAX2-RV64-NEXT: addi a4, a4, -241 -; LMULMAX2-RV64-NEXT: and a1, a5, a4 +; LMULMAX2-RV64-NEXT: addi a7, a4, -241 +; LMULMAX2-RV64-NEXT: and a2, a5, a7 ; LMULMAX2-RV64-NEXT: lui a5, 4112 ; LMULMAX2-RV64-NEXT: addiw a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; 
LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: slli a5, a5, 16 ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 60(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 60(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 -; LMULMAX2-RV64-NEXT: addi a2, a1, -1 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: addi a4, a2, -1 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: and a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 56(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 56(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 -; LMULMAX2-RV64-NEXT: addi a2, a1, -1 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; 
LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: addi a4, a2, -1 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: and a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 52(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 52(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 -; LMULMAX2-RV64-NEXT: addi a2, a1, -1 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: addi a4, a2, -1 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: and a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and 
a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 48(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 48(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 -; LMULMAX2-RV64-NEXT: addi a2, a1, -1 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: addi a4, a2, -1 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: and a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 44(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 44(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 -; LMULMAX2-RV64-NEXT: addi a2, a1, 
-1 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: addi a4, a2, -1 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: and a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 40(sp) +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 40(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 -; LMULMAX2-RV64-NEXT: addi a2, a1, -1 -; LMULMAX2-RV64-NEXT: not a1, a1 -; LMULMAX2-RV64-NEXT: and a1, a1, a2 -; LMULMAX2-RV64-NEXT: srli a2, a1, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v28 +; LMULMAX2-RV64-NEXT: or a2, a2, a1 +; LMULMAX2-RV64-NEXT: addi a4, a2, -1 +; LMULMAX2-RV64-NEXT: not a2, a2 +; LMULMAX2-RV64-NEXT: and a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a2, 1 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: sub a2, a2, a4 +; LMULMAX2-RV64-NEXT: and a4, a2, a3 +; LMULMAX2-RV64-NEXT: srli a2, a2, 2 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: add a2, a4, a2 +; LMULMAX2-RV64-NEXT: srli a4, a2, 4 +; LMULMAX2-RV64-NEXT: 
add a2, a2, a4 ; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: sub a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a2, a1, a3 -; LMULMAX2-RV64-NEXT: srli a1, a1, 2 -; LMULMAX2-RV64-NEXT: and a1, a1, a3 -; LMULMAX2-RV64-NEXT: add a1, a2, a1 -; LMULMAX2-RV64-NEXT: srli a2, a1, 4 -; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 -; LMULMAX2-RV64-NEXT: mul a1, a1, a5 -; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sw a1, 36(sp) -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: or a1, a1, a6 +; LMULMAX2-RV64-NEXT: mul a2, a2, a5 +; LMULMAX2-RV64-NEXT: srli a2, a2, 56 +; LMULMAX2-RV64-NEXT: sw a2, 36(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 ; LMULMAX2-RV64-NEXT: addi a2, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 ; LMULMAX2-RV64-NEXT: and a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: and a2, a2, a6 ; LMULMAX2-RV64-NEXT: sub a1, a1, a2 ; LMULMAX2-RV64-NEXT: and a2, a1, a3 ; LMULMAX2-RV64-NEXT: srli a1, a1, 2 @@ -7267,7 +7267,7 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: add a1, a2, a1 ; LMULMAX2-RV64-NEXT: srli a2, a1, 4 ; LMULMAX2-RV64-NEXT: add a1, a1, a2 -; LMULMAX2-RV64-NEXT: and a1, a1, a4 +; LMULMAX2-RV64-NEXT: and a1, a1, a7 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 ; LMULMAX2-RV64-NEXT: sw a1, 32(sp) @@ -7460,8 +7460,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: addi a2, zero, 1 -; LMULMAX1-RV64-NEXT: slli a7, a2, 32 -; LMULMAX1-RV64-NEXT: or a1, a1, a7 +; LMULMAX1-RV64-NEXT: slli a2, a2, 32 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: addi a3, a1, -1 ; LMULMAX1-RV64-NEXT: not a1, a1 ; LMULMAX1-RV64-NEXT: and a1, a1, a3 @@ -7473,8 +7473,8 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: 
slli a3, a3, 12 ; LMULMAX1-RV64-NEXT: addi a3, a3, 1365 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 -; LMULMAX1-RV64-NEXT: addi t0, a3, 1365 -; LMULMAX1-RV64-NEXT: and a4, a4, t0 +; LMULMAX1-RV64-NEXT: addi a7, a3, 1365 +; LMULMAX1-RV64-NEXT: and a4, a4, a7 ; LMULMAX1-RV64-NEXT: sub a1, a1, a4 ; LMULMAX1-RV64-NEXT: lui a4, 13107 ; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 @@ -7497,139 +7497,139 @@ define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 ; LMULMAX1-RV64-NEXT: addi a5, a5, 241 ; LMULMAX1-RV64-NEXT: slli a5, a5, 12 -; LMULMAX1-RV64-NEXT: addi a5, a5, -241 -; LMULMAX1-RV64-NEXT: and a2, a1, a5 +; LMULMAX1-RV64-NEXT: addi t0, a5, -241 +; LMULMAX1-RV64-NEXT: and a3, a1, t0 ; LMULMAX1-RV64-NEXT: lui a1, 4112 ; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 32(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 32(sp) ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e32, m1, ta, mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 -; LMULMAX1-RV64-NEXT: addi a3, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: or a3, a3, a2 +; LMULMAX1-RV64-NEXT: addi a5, a3, -1 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; 
LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 44(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 44(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 -; LMULMAX1-RV64-NEXT: addi a3, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v27 +; LMULMAX1-RV64-NEXT: or a3, a3, a2 +; LMULMAX1-RV64-NEXT: addi a5, a3, -1 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 40(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 40(sp) ; LMULMAX1-RV64-NEXT: 
vslidedown.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 -; LMULMAX1-RV64-NEXT: addi a3, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: or a3, a3, a2 +; LMULMAX1-RV64-NEXT: addi a5, a3, -1 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 36(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 36(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 -; LMULMAX1-RV64-NEXT: addi a3, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: or a3, a3, a2 +; LMULMAX1-RV64-NEXT: addi a5, a3, -1 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 
+; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 28(sp) +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 28(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 -; LMULMAX1-RV64-NEXT: addi a3, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: or a3, a3, a2 +; LMULMAX1-RV64-NEXT: addi a5, a3, -1 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 24(sp) +; 
LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 24(sp) ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 -; LMULMAX1-RV64-NEXT: addi a3, a2, -1 -; LMULMAX1-RV64-NEXT: not a2, a2 -; LMULMAX1-RV64-NEXT: and a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a2, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX1-RV64-NEXT: or a3, a3, a2 +; LMULMAX1-RV64-NEXT: addi a5, a3, -1 +; LMULMAX1-RV64-NEXT: not a3, a3 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a3, 1 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: sub a3, a3, a5 +; LMULMAX1-RV64-NEXT: and a5, a3, a4 +; LMULMAX1-RV64-NEXT: srli a3, a3, 2 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: add a3, a5, a3 +; LMULMAX1-RV64-NEXT: srli a5, a3, 4 +; LMULMAX1-RV64-NEXT: add a3, a3, a5 ; LMULMAX1-RV64-NEXT: and a3, a3, t0 -; LMULMAX1-RV64-NEXT: sub a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a3, a2, a4 -; LMULMAX1-RV64-NEXT: srli a2, a2, 2 -; LMULMAX1-RV64-NEXT: and a2, a2, a4 -; LMULMAX1-RV64-NEXT: add a2, a3, a2 -; LMULMAX1-RV64-NEXT: srli a3, a2, 4 -; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 -; LMULMAX1-RV64-NEXT: mul a2, a2, a1 -; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sw a2, 20(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 -; LMULMAX1-RV64-NEXT: or a2, a2, a7 +; LMULMAX1-RV64-NEXT: mul a3, a3, a1 +; LMULMAX1-RV64-NEXT: srli a3, a3, 56 +; LMULMAX1-RV64-NEXT: sw a3, 20(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a3, v25 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 ; LMULMAX1-RV64-NEXT: addi a3, a2, -1 ; LMULMAX1-RV64-NEXT: not a2, a2 ; LMULMAX1-RV64-NEXT: and a2, a2, a3 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 -; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: and a3, a3, a7 ; LMULMAX1-RV64-NEXT: sub a2, a2, a3 ; LMULMAX1-RV64-NEXT: and a3, a2, a4 ; LMULMAX1-RV64-NEXT: srli a2, a2, 2 @@ -7637,7 +7637,7 @@ define void 
@cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX1-RV64-NEXT: add a2, a3, a2 ; LMULMAX1-RV64-NEXT: srli a3, a2, 4 ; LMULMAX1-RV64-NEXT: add a2, a2, a3 -; LMULMAX1-RV64-NEXT: and a2, a2, a5 +; LMULMAX1-RV64-NEXT: and a2, a2, t0 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 ; LMULMAX1-RV64-NEXT: sw a1, 16(sp) @@ -7992,11 +7992,11 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 48 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi a7, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v26, (a7) +; LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) ; LMULMAX1-RV32-NEXT: sw zero, 44(sp) ; LMULMAX1-RV32-NEXT: sw zero, 36(sp) -; LMULMAX1-RV32-NEXT: addi a6, zero, 32 +; LMULMAX1-RV32-NEXT: addi a7, zero, 32 ; LMULMAX1-RV32-NEXT: lui a1, 349525 ; LMULMAX1-RV32-NEXT: addi a5, a1, 1365 ; LMULMAX1-RV32-NEXT: lui a1, 209715 @@ -8009,7 +8009,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_2 ; LMULMAX1-RV32-NEXT: # %bb.1: ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a6 +; LMULMAX1-RV32-NEXT: vsrl.vx v27, v26, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8051,7 +8051,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_5 ; LMULMAX1-RV32-NEXT: # %bb.4: -; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a6 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8093,7 +8093,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: sw zero, 20(sp) ; LMULMAX1-RV32-NEXT: bnez a1, .LBB7_8 ; LMULMAX1-RV32-NEXT: # %bb.7: -; LMULMAX1-RV32-NEXT: vsrl.vx 
v26, v25, a6 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v25, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8134,7 +8134,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: sw a1, 16(sp) ; LMULMAX1-RV32-NEXT: bnez a2, .LBB7_11 ; LMULMAX1-RV32-NEXT: # %bb.10: -; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a6 +; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a7 ; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV32-NEXT: addi a2, a1, -1 ; LMULMAX1-RV32-NEXT: not a1, a1 @@ -8178,7 +8178,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v26, (a7) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a6) ; LMULMAX1-RV32-NEXT: addi sp, sp, 48 ; LMULMAX1-RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index 0261d3695975f..2ee97a5907f40 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -270,41 +270,41 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_srem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 12(a1) +; RV32IM-NEXT: lh a7, 12(a1) ; RV32IM-NEXT: lh a3, 8(a1) ; RV32IM-NEXT: lh a4, 0(a1) ; RV32IM-NEXT: lh a1, 4(a1) ; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a2, a4, a5 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a7, a2, 31 -; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: add a2, a2, a7 -; RV32IM-NEXT: addi a7, zero, 95 -; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: addi a6, a5, 389 +; RV32IM-NEXT: mulh a5, a4, a6 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srli a5, a5, 6 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 ; RV32IM-NEXT: sub t0, a4, 
a2 -; RV32IM-NEXT: mulh a4, a1, a5 +; RV32IM-NEXT: mulh a4, a1, a6 ; RV32IM-NEXT: add a4, a4, a1 ; RV32IM-NEXT: srli a2, a4, 31 ; RV32IM-NEXT: srli a4, a4, 6 ; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: mul a2, a2, a5 ; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulh a2, a3, a5 +; RV32IM-NEXT: mulh a2, a3, a6 ; RV32IM-NEXT: add a2, a2, a3 ; RV32IM-NEXT: srli a4, a2, 31 ; RV32IM-NEXT: srli a2, a2, 6 ; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: mul a2, a2, a7 +; RV32IM-NEXT: mul a2, a2, a5 ; RV32IM-NEXT: sub a2, a3, a2 -; RV32IM-NEXT: mulh a3, a6, a5 -; RV32IM-NEXT: add a3, a3, a6 +; RV32IM-NEXT: mulh a3, a7, a6 +; RV32IM-NEXT: add a3, a3, a7 ; RV32IM-NEXT: srli a4, a3, 31 ; RV32IM-NEXT: srli a3, a3, 6 ; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: mul a3, a3, a7 -; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: sub a3, a7, a3 ; RV32IM-NEXT: sh a3, 6(a0) ; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) @@ -357,8 +357,8 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_srem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 24(a1) -; RV64IM-NEXT: lh a7, 16(a1) +; RV64IM-NEXT: lh a7, 24(a1) +; RV64IM-NEXT: lh a3, 16(a1) ; RV64IM-NEXT: lh a4, 8(a1) ; RV64IM-NEXT: lh a1, 0(a1) ; RV64IM-NEXT: lui a5, 1045903 @@ -368,36 +368,36 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 12 ; RV64IM-NEXT: addi a5, a5, -905 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a3, a2, 63 -; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: add a2, a2, a3 -; RV64IM-NEXT: addi a3, zero, 95 -; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: addi a6, a5, -1767 +; RV64IM-NEXT: mulh a5, a1, a6 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srli a5, a5, 6 +; RV64IM-NEXT: add a2, a5, a2 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: 
mul a2, a2, a5 ; RV64IM-NEXT: sub t0, a1, a2 -; RV64IM-NEXT: mulh a2, a4, a5 +; RV64IM-NEXT: mulh a2, a4, a6 ; RV64IM-NEXT: add a2, a2, a4 ; RV64IM-NEXT: srli a1, a2, 63 ; RV64IM-NEXT: srli a2, a2, 6 ; RV64IM-NEXT: add a1, a2, a1 -; RV64IM-NEXT: mul a1, a1, a3 +; RV64IM-NEXT: mul a1, a1, a5 ; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: mulh a2, a7, a5 -; RV64IM-NEXT: add a2, a2, a7 +; RV64IM-NEXT: mulh a2, a3, a6 +; RV64IM-NEXT: add a2, a2, a3 ; RV64IM-NEXT: srli a4, a2, 63 ; RV64IM-NEXT: srli a2, a2, 6 ; RV64IM-NEXT: add a2, a2, a4 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub a2, a7, a2 -; RV64IM-NEXT: mulh a4, a6, a5 -; RV64IM-NEXT: add a4, a4, a6 -; RV64IM-NEXT: srli a5, a4, 63 -; RV64IM-NEXT: srli a4, a4, 6 -; RV64IM-NEXT: add a4, a4, a5 -; RV64IM-NEXT: mul a3, a4, a3 -; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: mulh a3, a7, a6 +; RV64IM-NEXT: add a3, a3, a7 +; RV64IM-NEXT: srli a4, a3, 63 +; RV64IM-NEXT: srli a3, a3, 6 +; RV64IM-NEXT: add a3, a3, a4 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: sub a3, a7, a3 ; RV64IM-NEXT: sh a3, 6(a0) ; RV64IM-NEXT: sh a2, 4(a0) ; RV64IM-NEXT: sh a1, 2(a0) @@ -484,49 +484,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_srem_sdiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lh a6, 0(a1) +; RV32IM-NEXT: lh a7, 0(a1) ; RV32IM-NEXT: lh a3, 4(a1) ; RV32IM-NEXT: lh a4, 12(a1) ; RV32IM-NEXT: lh a1, 8(a1) ; RV32IM-NEXT: lui a5, 706409 -; RV32IM-NEXT: addi a5, a5, 389 -; RV32IM-NEXT: mulh a2, a4, a5 -; RV32IM-NEXT: add a2, a2, a4 -; RV32IM-NEXT: srli a7, a2, 31 -; RV32IM-NEXT: srai a2, a2, 6 -; RV32IM-NEXT: add t0, a2, a7 -; RV32IM-NEXT: addi a7, zero, 95 -; RV32IM-NEXT: mul a2, t0, a7 -; RV32IM-NEXT: sub t1, a4, a2 -; RV32IM-NEXT: mulh a4, a1, a5 -; RV32IM-NEXT: add a4, a4, a1 -; RV32IM-NEXT: srli a2, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a2, a4, a2 -; RV32IM-NEXT: mul a4, a2, a7 -; RV32IM-NEXT: sub t2, a1, 
a4 -; RV32IM-NEXT: mulh a4, a3, a5 -; RV32IM-NEXT: add a4, a4, a3 -; RV32IM-NEXT: srli a1, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: mul a4, a1, a7 -; RV32IM-NEXT: sub a3, a3, a4 -; RV32IM-NEXT: mulh a4, a6, a5 -; RV32IM-NEXT: add a4, a4, a6 -; RV32IM-NEXT: srli a5, a4, 31 -; RV32IM-NEXT: srai a4, a4, 6 -; RV32IM-NEXT: add a4, a4, a5 -; RV32IM-NEXT: mul a5, a4, a7 -; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: addi a6, a5, 389 +; RV32IM-NEXT: mulh a5, a4, a6 +; RV32IM-NEXT: add a5, a5, a4 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srai a5, a5, 6 +; RV32IM-NEXT: add t3, a5, a2 +; RV32IM-NEXT: addi t0, zero, 95 +; RV32IM-NEXT: mul a5, t3, t0 +; RV32IM-NEXT: sub t1, a4, a5 +; RV32IM-NEXT: mulh a5, a1, a6 +; RV32IM-NEXT: add a5, a5, a1 +; RV32IM-NEXT: srli a4, a5, 31 +; RV32IM-NEXT: srai a5, a5, 6 ; RV32IM-NEXT: add a4, a5, a4 +; RV32IM-NEXT: mul a5, a4, t0 +; RV32IM-NEXT: sub t2, a1, a5 +; RV32IM-NEXT: mulh a5, a3, a6 +; RV32IM-NEXT: add a5, a5, a3 +; RV32IM-NEXT: srli a1, a5, 31 +; RV32IM-NEXT: srai a5, a5, 6 +; RV32IM-NEXT: add a1, a5, a1 +; RV32IM-NEXT: mul a5, a1, t0 +; RV32IM-NEXT: sub a3, a3, a5 +; RV32IM-NEXT: mulh a5, a7, a6 +; RV32IM-NEXT: add a5, a5, a7 +; RV32IM-NEXT: srli a2, a5, 31 +; RV32IM-NEXT: srai a5, a5, 6 +; RV32IM-NEXT: add a2, a5, a2 +; RV32IM-NEXT: mul a5, a2, t0 +; RV32IM-NEXT: sub a5, a7, a5 +; RV32IM-NEXT: add a2, a5, a2 ; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a2, t2, a2 -; RV32IM-NEXT: add a3, t1, t0 -; RV32IM-NEXT: sh a3, 6(a0) -; RV32IM-NEXT: sh a2, 4(a0) +; RV32IM-NEXT: add a3, t2, a4 +; RV32IM-NEXT: add a4, t1, t3 +; RV32IM-NEXT: sh a4, 6(a0) +; RV32IM-NEXT: sh a3, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) -; RV32IM-NEXT: sh a4, 0(a0) +; RV32IM-NEXT: sh a2, 0(a0) ; RV32IM-NEXT: ret ; ; RV64I-LABEL: combine_srem_sdiv: @@ -603,8 +603,8 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_srem_sdiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lh a6, 0(a1) -; 
RV64IM-NEXT: lh a7, 8(a1) +; RV64IM-NEXT: lh a7, 0(a1) +; RV64IM-NEXT: lh a3, 8(a1) ; RV64IM-NEXT: lh a4, 16(a1) ; RV64IM-NEXT: lh a1, 24(a1) ; RV64IM-NEXT: lui a5, 1045903 @@ -614,38 +614,38 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 12 ; RV64IM-NEXT: addi a5, a5, -905 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, -1767 -; RV64IM-NEXT: mulh a2, a1, a5 -; RV64IM-NEXT: add a2, a2, a1 -; RV64IM-NEXT: srli a3, a2, 63 -; RV64IM-NEXT: srai a2, a2, 6 -; RV64IM-NEXT: add t3, a2, a3 +; RV64IM-NEXT: addi a6, a5, -1767 +; RV64IM-NEXT: mulh a5, a1, a6 +; RV64IM-NEXT: add a5, a5, a1 +; RV64IM-NEXT: srli a2, a5, 63 +; RV64IM-NEXT: srai a5, a5, 6 +; RV64IM-NEXT: add t3, a5, a2 ; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mul a3, t3, t0 -; RV64IM-NEXT: sub t1, a1, a3 -; RV64IM-NEXT: mulh a3, a4, a5 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a1, a3, 63 -; RV64IM-NEXT: srai a3, a3, 6 -; RV64IM-NEXT: add a1, a3, a1 -; RV64IM-NEXT: mul a3, a1, t0 -; RV64IM-NEXT: sub t2, a4, a3 -; RV64IM-NEXT: mulh a4, a7, a5 -; RV64IM-NEXT: add a4, a4, a7 -; RV64IM-NEXT: srli a3, a4, 63 -; RV64IM-NEXT: srai a4, a4, 6 -; RV64IM-NEXT: add a3, a4, a3 -; RV64IM-NEXT: mul a4, a3, t0 -; RV64IM-NEXT: sub a4, a7, a4 -; RV64IM-NEXT: mulh a5, a6, a5 -; RV64IM-NEXT: add a5, a5, a6 +; RV64IM-NEXT: mul a5, t3, t0 +; RV64IM-NEXT: sub t1, a1, a5 +; RV64IM-NEXT: mulh a5, a4, a6 +; RV64IM-NEXT: add a5, a5, a4 +; RV64IM-NEXT: srli a1, a5, 63 +; RV64IM-NEXT: srai a5, a5, 6 +; RV64IM-NEXT: add a1, a5, a1 +; RV64IM-NEXT: mul a5, a1, t0 +; RV64IM-NEXT: sub t2, a4, a5 +; RV64IM-NEXT: mulh a5, a3, a6 +; RV64IM-NEXT: add a5, a5, a3 +; RV64IM-NEXT: srli a4, a5, 63 +; RV64IM-NEXT: srai a5, a5, 6 +; RV64IM-NEXT: add a4, a5, a4 +; RV64IM-NEXT: mul a5, a4, t0 +; RV64IM-NEXT: sub a3, a3, a5 +; RV64IM-NEXT: mulh a5, a7, a6 +; RV64IM-NEXT: add a5, a5, a7 ; RV64IM-NEXT: srli a2, a5, 63 ; RV64IM-NEXT: srai a5, a5, 6 ; RV64IM-NEXT: add a2, a5, a2 ; RV64IM-NEXT: 
mul a5, a2, t0 -; RV64IM-NEXT: sub a5, a6, a5 +; RV64IM-NEXT: sub a5, a7, a5 ; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: add a3, a3, a4 ; RV64IM-NEXT: add a1, t2, a1 ; RV64IM-NEXT: add a4, t1, t3 ; RV64IM-NEXT: sh a4, 6(a0) diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index e91b0834e7d23..88dd2446b9b6c 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -261,41 +261,41 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: fold_urem_vec_2: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a6, 12(a1) -; RV32IM-NEXT: lhu a7, 8(a1) +; RV32IM-NEXT: lhu a7, 12(a1) +; RV32IM-NEXT: lhu a3, 8(a1) ; RV32IM-NEXT: lhu a4, 0(a1) ; RV32IM-NEXT: lhu a1, 4(a1) ; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a2, a4, a5 -; RV32IM-NEXT: sub a3, a4, a2 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: addi a6, a5, 777 +; RV32IM-NEXT: mulhu a5, a4, a6 +; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 ; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: addi a3, zero, 95 -; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: addi a5, zero, 95 +; RV32IM-NEXT: mul a2, a2, a5 ; RV32IM-NEXT: sub t0, a4, a2 -; RV32IM-NEXT: mulhu a4, a1, a5 +; RV32IM-NEXT: mulhu a4, a1, a6 ; RV32IM-NEXT: sub a2, a1, a4 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a2, a2, a4 ; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: mul a2, a2, a5 ; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: mulhu a2, a7, a5 -; RV32IM-NEXT: sub a4, a7, a2 +; RV32IM-NEXT: mulhu a2, a3, a6 +; RV32IM-NEXT: sub a4, a3, a2 ; RV32IM-NEXT: srli a4, a4, 1 ; RV32IM-NEXT: add a2, a4, a2 ; RV32IM-NEXT: srli a2, a2, 6 -; RV32IM-NEXT: mul a2, a2, a3 -; RV32IM-NEXT: sub a2, a7, a2 -; RV32IM-NEXT: mulhu a4, a6, a5 -; RV32IM-NEXT: sub a5, a6, a4 -; RV32IM-NEXT: srli a5, 
a5, 1 -; RV32IM-NEXT: add a4, a5, a4 -; RV32IM-NEXT: srli a4, a4, 6 -; RV32IM-NEXT: mul a3, a4, a3 -; RV32IM-NEXT: sub a3, a6, a3 +; RV32IM-NEXT: mul a2, a2, a5 +; RV32IM-NEXT: sub a2, a3, a2 +; RV32IM-NEXT: mulhu a3, a7, a6 +; RV32IM-NEXT: sub a4, a7, a3 +; RV32IM-NEXT: srli a4, a4, 1 +; RV32IM-NEXT: add a3, a4, a3 +; RV32IM-NEXT: srli a3, a3, 6 +; RV32IM-NEXT: mul a3, a3, a5 +; RV32IM-NEXT: sub a3, a7, a3 ; RV32IM-NEXT: sh a3, 6(a0) ; RV32IM-NEXT: sh a2, 4(a0) ; RV32IM-NEXT: sh a1, 2(a0) @@ -348,8 +348,8 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: fold_urem_vec_2: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a6, 24(a1) -; RV64IM-NEXT: lhu a7, 16(a1) +; RV64IM-NEXT: lhu a7, 24(a1) +; RV64IM-NEXT: lhu a3, 16(a1) ; RV64IM-NEXT: lhu a4, 8(a1) ; RV64IM-NEXT: lhu a1, 0(a1) ; RV64IM-NEXT: lui a5, 1423 @@ -359,36 +359,36 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 13 ; RV64IM-NEXT: addi a5, a5, -1811 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: sub a3, a1, a2 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: addi a6, a5, 561 +; RV64IM-NEXT: mulhu a5, a1, a6 +; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: addi a3, zero, 95 -; RV64IM-NEXT: mul a2, a2, a3 +; RV64IM-NEXT: addi a5, zero, 95 +; RV64IM-NEXT: mul a2, a2, a5 ; RV64IM-NEXT: sub t0, a1, a2 -; RV64IM-NEXT: mulhu a2, a4, a5 +; RV64IM-NEXT: mulhu a2, a4, a6 ; RV64IM-NEXT: sub a1, a4, a2 ; RV64IM-NEXT: srli a1, a1, 1 ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mul a1, a1, a3 +; RV64IM-NEXT: mul a1, a1, a5 ; RV64IM-NEXT: sub a1, a4, a1 -; RV64IM-NEXT: mulhu a2, a7, a5 -; RV64IM-NEXT: sub a4, a7, a2 +; RV64IM-NEXT: mulhu a2, a3, a6 +; RV64IM-NEXT: sub a4, a3, a2 ; RV64IM-NEXT: srli a4, a4, 1 ; RV64IM-NEXT: add a2, a4, a2 ; 
RV64IM-NEXT: srli a2, a2, 6 -; RV64IM-NEXT: mul a2, a2, a3 -; RV64IM-NEXT: sub a2, a7, a2 -; RV64IM-NEXT: mulhu a4, a6, a5 -; RV64IM-NEXT: sub a5, a6, a4 -; RV64IM-NEXT: srli a5, a5, 1 -; RV64IM-NEXT: add a4, a5, a4 -; RV64IM-NEXT: srli a4, a4, 6 -; RV64IM-NEXT: mul a3, a4, a3 -; RV64IM-NEXT: sub a3, a6, a3 +; RV64IM-NEXT: mul a2, a2, a5 +; RV64IM-NEXT: sub a2, a3, a2 +; RV64IM-NEXT: mulhu a3, a7, a6 +; RV64IM-NEXT: sub a4, a7, a3 +; RV64IM-NEXT: srli a4, a4, 1 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: srli a3, a3, 6 +; RV64IM-NEXT: mul a3, a3, a5 +; RV64IM-NEXT: sub a3, a7, a3 ; RV64IM-NEXT: sh a3, 6(a0) ; RV64IM-NEXT: sh a2, 4(a0) ; RV64IM-NEXT: sh a1, 2(a0) @@ -475,44 +475,44 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV32IM-LABEL: combine_urem_udiv: ; RV32IM: # %bb.0: -; RV32IM-NEXT: lhu a6, 0(a1) -; RV32IM-NEXT: lhu a7, 4(a1) +; RV32IM-NEXT: lhu a7, 0(a1) +; RV32IM-NEXT: lhu a3, 4(a1) ; RV32IM-NEXT: lhu a4, 12(a1) ; RV32IM-NEXT: lhu a1, 8(a1) ; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 -; RV32IM-NEXT: mulhu a2, a4, a5 -; RV32IM-NEXT: sub a3, a4, a2 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a2, a3, a2 +; RV32IM-NEXT: addi a6, a5, 777 +; RV32IM-NEXT: mulhu a5, a4, a6 +; RV32IM-NEXT: sub a2, a4, a5 +; RV32IM-NEXT: srli a2, a2, 1 +; RV32IM-NEXT: add a2, a2, a5 ; RV32IM-NEXT: srli t3, a2, 6 ; RV32IM-NEXT: addi t0, zero, 95 -; RV32IM-NEXT: mul a3, t3, t0 -; RV32IM-NEXT: sub t1, a4, a3 -; RV32IM-NEXT: mulhu a4, a1, a5 -; RV32IM-NEXT: sub a3, a1, a4 -; RV32IM-NEXT: srli a3, a3, 1 -; RV32IM-NEXT: add a3, a3, a4 -; RV32IM-NEXT: srli a3, a3, 6 -; RV32IM-NEXT: mul a4, a3, t0 -; RV32IM-NEXT: sub t2, a1, a4 -; RV32IM-NEXT: mulhu a4, a7, a5 -; RV32IM-NEXT: sub a1, a7, a4 +; RV32IM-NEXT: mul a5, t3, t0 +; RV32IM-NEXT: sub t1, a4, a5 +; RV32IM-NEXT: mulhu a5, a1, a6 +; RV32IM-NEXT: sub a4, a1, a5 +; RV32IM-NEXT: srli a4, a4, 1 +; RV32IM-NEXT: add a4, a4, a5 +; RV32IM-NEXT: srli a4, a4, 6 +; RV32IM-NEXT: mul a5, a4, t0 
+; RV32IM-NEXT: sub t2, a1, a5 +; RV32IM-NEXT: mulhu a5, a3, a6 +; RV32IM-NEXT: sub a1, a3, a5 ; RV32IM-NEXT: srli a1, a1, 1 -; RV32IM-NEXT: add a1, a1, a4 +; RV32IM-NEXT: add a1, a1, a5 ; RV32IM-NEXT: srli a1, a1, 6 -; RV32IM-NEXT: mul a4, a1, t0 -; RV32IM-NEXT: sub a4, a7, a4 -; RV32IM-NEXT: mulhu a5, a6, a5 -; RV32IM-NEXT: sub a2, a6, a5 +; RV32IM-NEXT: mul a5, a1, t0 +; RV32IM-NEXT: sub a3, a3, a5 +; RV32IM-NEXT: mulhu a5, a7, a6 +; RV32IM-NEXT: sub a2, a7, a5 ; RV32IM-NEXT: srli a2, a2, 1 ; RV32IM-NEXT: add a2, a2, a5 ; RV32IM-NEXT: srli a2, a2, 6 ; RV32IM-NEXT: mul a5, a2, t0 -; RV32IM-NEXT: sub a5, a6, a5 +; RV32IM-NEXT: sub a5, a7, a5 ; RV32IM-NEXT: add a2, a5, a2 -; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: add a3, t2, a3 +; RV32IM-NEXT: add a1, a3, a1 +; RV32IM-NEXT: add a3, t2, a4 ; RV32IM-NEXT: add a4, t1, t3 ; RV32IM-NEXT: sh a4, 6(a0) ; RV32IM-NEXT: sh a3, 4(a0) @@ -594,8 +594,8 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; ; RV64IM-LABEL: combine_urem_udiv: ; RV64IM: # %bb.0: -; RV64IM-NEXT: lhu a6, 0(a1) -; RV64IM-NEXT: lhu a7, 8(a1) +; RV64IM-NEXT: lhu a7, 0(a1) +; RV64IM-NEXT: lhu a3, 8(a1) ; RV64IM-NEXT: lhu a4, 16(a1) ; RV64IM-NEXT: lhu a1, 24(a1) ; RV64IM-NEXT: lui a5, 1423 @@ -605,38 +605,38 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind { ; RV64IM-NEXT: slli a5, a5, 13 ; RV64IM-NEXT: addi a5, a5, -1811 ; RV64IM-NEXT: slli a5, a5, 12 -; RV64IM-NEXT: addi a5, a5, 561 -; RV64IM-NEXT: mulhu a2, a1, a5 -; RV64IM-NEXT: sub a3, a1, a2 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a2, a3, a2 +; RV64IM-NEXT: addi a6, a5, 561 +; RV64IM-NEXT: mulhu a5, a1, a6 +; RV64IM-NEXT: sub a2, a1, a5 +; RV64IM-NEXT: srli a2, a2, 1 +; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: srli t3, a2, 6 ; RV64IM-NEXT: addi t0, zero, 95 -; RV64IM-NEXT: mul a3, t3, t0 -; RV64IM-NEXT: sub t1, a1, a3 -; RV64IM-NEXT: mulhu a3, a4, a5 -; RV64IM-NEXT: sub a1, a4, a3 +; RV64IM-NEXT: mul a5, t3, t0 +; RV64IM-NEXT: sub t1, a1, a5 +; RV64IM-NEXT: 
mulhu a5, a4, a6 +; RV64IM-NEXT: sub a1, a4, a5 ; RV64IM-NEXT: srli a1, a1, 1 -; RV64IM-NEXT: add a1, a1, a3 +; RV64IM-NEXT: add a1, a1, a5 ; RV64IM-NEXT: srli a1, a1, 6 -; RV64IM-NEXT: mul a3, a1, t0 -; RV64IM-NEXT: sub t2, a4, a3 -; RV64IM-NEXT: mulhu a4, a7, a5 -; RV64IM-NEXT: sub a3, a7, a4 -; RV64IM-NEXT: srli a3, a3, 1 -; RV64IM-NEXT: add a3, a3, a4 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: mul a4, a3, t0 -; RV64IM-NEXT: sub a4, a7, a4 -; RV64IM-NEXT: mulhu a5, a6, a5 -; RV64IM-NEXT: sub a2, a6, a5 +; RV64IM-NEXT: mul a5, a1, t0 +; RV64IM-NEXT: sub t2, a4, a5 +; RV64IM-NEXT: mulhu a5, a3, a6 +; RV64IM-NEXT: sub a4, a3, a5 +; RV64IM-NEXT: srli a4, a4, 1 +; RV64IM-NEXT: add a4, a4, a5 +; RV64IM-NEXT: srli a4, a4, 6 +; RV64IM-NEXT: mul a5, a4, t0 +; RV64IM-NEXT: sub a3, a3, a5 +; RV64IM-NEXT: mulhu a5, a7, a6 +; RV64IM-NEXT: sub a2, a7, a5 ; RV64IM-NEXT: srli a2, a2, 1 ; RV64IM-NEXT: add a2, a2, a5 ; RV64IM-NEXT: srli a2, a2, 6 ; RV64IM-NEXT: mul a5, a2, t0 -; RV64IM-NEXT: sub a5, a6, a5 +; RV64IM-NEXT: sub a5, a7, a5 ; RV64IM-NEXT: add a2, a5, a2 -; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: add a3, a3, a4 ; RV64IM-NEXT: add a1, t2, a1 ; RV64IM-NEXT: add a4, t1, t3 ; RV64IM-NEXT: sh a4, 6(a0) diff --git a/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll b/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll index c6b5c7b3513d2..2e755109a9559 100644 --- a/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/Thumb/dyn-stackalloc.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY -; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC +; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s 
-mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK %struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* } %struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* } @@ -45,8 +45,7 @@ define void @t2(%struct.comment* %vc, i8* %tag, i8* %contents) { ; CHECK: sub sp, # ; CHECK: mov r[[R0:[0-9]+]], sp ; CHECK: str r{{[0-9+]}}, [r[[R0]] -; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]] -; RA_BASIC: stm r[[R0]]! +; CHECK: str r{{[0-9+]}}, [r[[R0]] ; CHECK-NOT: ldr r0, [sp ; CHECK: mov r[[R1:[0-9]+]], sp ; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll index 73865945cdc35..be68cb3cf63a9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-disabled-in-loloops.ll @@ -10,7 +10,7 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { ; ENABLED-LABEL: check_option: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; ENABLED-NEXT: cmp r3, #1 ; ENABLED-NEXT: blt .LBB0_4 ; ENABLED-NEXT: @ %bb.1: @ %vector.ph.preheader @@ -32,11 +32,11 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; ENABLED-NEXT: letp lr, .LBB0_3 ; ENABLED-NEXT: b .LBB0_2 ; ENABLED-NEXT: .LBB0_4: @ %for.cond.cleanup -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; ; DISABLED-LABEL: check_option: ; DISABLED: @ %bb.0: @ %entry -; DISABLED-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; DISABLED-NEXT: push.w {r4, r5, r6, r7, 
r8, r9, lr} ; DISABLED-NEXT: cmp r3, #1 ; DISABLED-NEXT: blt .LBB0_4 ; DISABLED-NEXT: @ %bb.1: @ %vector.ph.preheader @@ -48,7 +48,7 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; DISABLED-NEXT: .LBB0_2: @ %vector.ph ; DISABLED-NEXT: @ =>This Loop Header: Depth=1 ; DISABLED-NEXT: @ Child Loop BB0_3 Depth 2 -; DISABLED-NEXT: mov r7, r8 +; DISABLED-NEXT: mov r9, r8 ; DISABLED-NEXT: mov r12, r0 ; DISABLED-NEXT: mov r4, r2 ; DISABLED-NEXT: mov r5, r1 @@ -57,9 +57,9 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; DISABLED-NEXT: .LBB0_3: @ %vector.body ; DISABLED-NEXT: @ Parent Loop BB0_2 Depth=1 ; DISABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; DISABLED-NEXT: mov lr, r7 +; DISABLED-NEXT: mov lr, r9 ; DISABLED-NEXT: vctp.32 r6 -; DISABLED-NEXT: subs r7, #1 +; DISABLED-NEXT: sub.w r9, r9, #1 ; DISABLED-NEXT: subs r6, #4 ; DISABLED-NEXT: vpstt ; DISABLED-NEXT: vldrwt.u32 q0, [r5], #16 @@ -70,7 +70,7 @@ define dso_local void @check_option(i32* noalias nocapture %A, i32* noalias noca ; DISABLED-NEXT: le lr, .LBB0_3 ; DISABLED-NEXT: b .LBB0_2 ; DISABLED-NEXT: .LBB0_4: @ %for.cond.cleanup -; DISABLED-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; DISABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: %cmp8 = icmp sgt i32 %N, 0 %0 = add i32 %N, 3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index af5c76fd44770..13376500baadf 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,17 +17,15 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, 
r6, r7, r8, r9, r10, lr} -; ENABLED-NEXT: sub sp, #4 +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} ; ENABLED-NEXT: cmp r3, #1 -; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill ; ENABLED-NEXT: blt .LBB0_8 ; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph -; ENABLED-NEXT: ldr r0, [sp, #36] -; ENABLED-NEXT: add.w r12, r2, #3 -; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; ENABLED-NEXT: mov.w r8, #0 -; ENABLED-NEXT: mov r9, r12 +; ENABLED-NEXT: mov r11, r0 +; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: add.w r9, r2, #3 +; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 ; ENABLED-NEXT: b .LBB0_4 @@ -37,31 +35,32 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] -; ENABLED-NEXT: add.w r8, r8, #1 +; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1] +; ENABLED-NEXT: add.w r12, r12, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r8, r3 +; ENABLED-NEXT: cmp r12, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r8 +; ENABLED-NEXT: cmp r2, r12 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: sub.w r4, r2, r8 +; ENABLED-NEXT: sub.w r4, r2, r12 ; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 -; ENABLED-NEXT: sub.w r0, r12, r8 +; ENABLED-NEXT: adds r0, r2, #3 +; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 -; ENABLED-NEXT: ldr r0, [sp] @ 4-byte 
Reload +; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 @@ -83,22 +82,19 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 -; ENABLED-NEXT: add sp, #4 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc} ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; NOREDUCTIONS-NEXT: sub sp, #4 +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: cmp r3, #1 -; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill ; NOREDUCTIONS-NEXT: blt .LBB0_8 ; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] -; NOREDUCTIONS-NEXT: add.w r12, r2, #3 -; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: mov.w r8, #0 -; NOREDUCTIONS-NEXT: mov r9, r12 +; NOREDUCTIONS-NEXT: mov r11, r0 +; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: add.w r9, r2, #3 +; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 ; NOREDUCTIONS-NEXT: b .LBB0_4 @@ -108,31 +104,32 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] -; NOREDUCTIONS-NEXT: add.w r8, r8, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1] +; NOREDUCTIONS-NEXT: add.w r12, r12, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 -; NOREDUCTIONS-NEXT: cmp r8, r3 +; NOREDUCTIONS-NEXT: cmp r12, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ 
%for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r8 +; NOREDUCTIONS-NEXT: cmp r2, r12 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 +; NOREDUCTIONS-NEXT: adds r0, r2, #3 +; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -154,8 +151,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: add sp, #4 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc} entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll index fba800bc3a5f0..810c25442df79 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll @@ -160,31 +160,27 @@ define dso_local i32 @b(i32* %c, i32 %d, i32 %e, i32* %n) "frame-pointer"="all" ; CHECK-NEXT: add r7, sp, #12 ; CHECK-NEXT: .save {r8, r9, 
r10, r11} ; CHECK-NEXT: push.w {r8, r9, r10, r11} -; CHECK-NEXT: .pad #12 -; CHECK-NEXT: sub sp, #12 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: wls lr, r1, .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: adds r2, r3, #4 +; CHECK-NEXT: adds r4, r3, #4 ; CHECK-NEXT: add.w r9, r0, #4 ; CHECK-NEXT: mvn r11, #1 ; CHECK-NEXT: @ implicit-def: $r6 ; CHECK-NEXT: @ implicit-def: $r12 -; CHECK-NEXT: str r4, [sp] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r9, #-4] -; CHECK-NEXT: ldr.w r10, [r2] +; CHECK-NEXT: ldr.w r10, [r4] ; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: muls r1, r3, r1 ; CHECK-NEXT: adds.w r8, r1, #-2147483648 ; CHECK-NEXT: asr.w r5, r1, #31 ; CHECK-NEXT: adc r1, r5, #0 ; CHECK-NEXT: mul r5, r10, r0 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: ldr.w r2, [r11, #4] -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: add.w r5, r5, #-2147483648 ; CHECK-NEXT: asrl r8, r1, r5 ; CHECK-NEXT: smull r4, r5, r10, r8 @@ -193,47 +189,48 @@ define dso_local i32 @b(i32* %c, i32 %d, i32 %e, i32* %n) "frame-pointer"="all" ; CHECK-NEXT: mov r4, r5 ; CHECK-NEXT: lsll r4, r1, r10 ; CHECK-NEXT: lsll r4, r1, #30 -; CHECK-NEXT: ldr.w r4, [r11] +; CHECK-NEXT: ldrd r4, r8, [r11] ; CHECK-NEXT: asrs r5, r1, #31 -; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: muls r4, r6, r4 -; CHECK-NEXT: adds r4, #2 -; CHECK-NEXT: lsll r8, r5, r4 +; CHECK-NEXT: adds r2, r4, #2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: lsll r4, r5, r2 +; CHECK-NEXT: add.w r1, r4, #-2147483648 ; CHECK-NEXT: ldr r4, [r9], #4 ; CHECK-NEXT: asr.w r5, r12, #31 -; CHECK-NEXT: add.w r8, r8, #-2147483648 ; CHECK-NEXT: muls r4, r3, r4 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: adds.w r1, r12, r4 +; CHECK-NEXT: adds.w r2, r12, r4 ; CHECK-NEXT: adc.w r5, r5, 
r4, asr #31 -; CHECK-NEXT: smull r6, r4, r2, r6 -; CHECK-NEXT: adds.w r1, r1, #-2147483648 -; CHECK-NEXT: adc r1, r5, #0 -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: asrs r5, r1, #31 -; CHECK-NEXT: subs r6, r1, r6 +; CHECK-NEXT: smull r6, r4, r8, r6 +; CHECK-NEXT: adds.w r2, r2, #-2147483648 +; CHECK-NEXT: adc r2, r5, #0 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: subs r6, r2, r6 ; CHECK-NEXT: sbcs r5, r4 ; CHECK-NEXT: adds.w r6, r6, #-2147483648 ; CHECK-NEXT: adc r5, r5, #0 -; CHECK-NEXT: asrl r6, r5, r8 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: asrl r6, r5, r1 +; CHECK-NEXT: movs r1, #2 ; CHECK-NEXT: lsrl r6, r5, #2 -; CHECK-NEXT: movs r5, #2 -; CHECK-NEXT: str r6, [r5] -; CHECK-NEXT: ldr r5, [r11], #-4 -; CHECK-NEXT: mls r1, r5, r10, r1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: str r6, [r1] +; CHECK-NEXT: ldr r1, [r11], #-4 +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: mls r1, r1, r10, r2 ; CHECK-NEXT: adds.w r12, r1, #-2147483648 -; CHECK-NEXT: asr.w r4, r1, #31 -; CHECK-NEXT: adc r1, r4, #0 -; CHECK-NEXT: ldrd r4, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: asr.w r2, r1, #31 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload ; CHECK-NEXT: lsrl r12, r1, #2 ; CHECK-NEXT: rsb.w r1, r12, #0 -; CHECK-NEXT: adds r0, #4 -; CHECK-NEXT: str r1, [r4] -; CHECK-NEXT: str r1, [r2, #-4] -; CHECK-NEXT: adds r2, #4 +; CHECK-NEXT: str r1, [r2] +; CHECK-NEXT: str r1, [r4, #-4] +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: .LBB2_3: @ %while.end -; CHECK-NEXT: add sp, #12 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop.w {r8, r9, r10, r11} ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll index e6beb751fbb85..fe72b501b39a9 100644 --- a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll +++ b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll @@ -31,8 +31,8 @@ define %union.rec* @Manifest(%union.rec* %x, %union.rec* %env, %struct.STYLE* %s ; 
CHECK-NEXT: ldrd r8, lr, [r7, #20] ; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: ldm.w r10, {r4, r9, r10} -; CHECK-NEXT: ldr.w r12, [r7, #28] +; CHECK-NEXT: ldm.w r10, {r4, r6, r10} +; CHECK-NEXT: ldrd r12, r9, [r7, #28] ; CHECK-NEXT: ittt ne ; CHECK-NEXT: addne sp, #292 ; CHECK-NEXT: popne.w {r8, r10, r11} @@ -46,29 +46,25 @@ define %union.rec* @Manifest(%union.rec* %x, %union.rec* %env, %struct.STYLE* %s ; CHECK-NEXT: @ %bb.3: @ %bb420 ; CHECK-NEXT: movw r5, :lower16:(L_zz_hold$non_lazy_ptr-(LPC0_0+4)) ; CHECK-NEXT: movt r5, :upper16:(L_zz_hold$non_lazy_ptr-(LPC0_0+4)) -; CHECK-NEXT: movw r11, :lower16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) ; CHECK-NEXT: LPC0_0: ; CHECK-NEXT: add r5, pc -; CHECK-NEXT: movt r11, :upper16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) +; CHECK-NEXT: ldr.w r11, [r5] +; CHECK-NEXT: str.w r11, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: movw r5, :lower16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) +; CHECK-NEXT: movt r5, :upper16:(L_zz_res$non_lazy_ptr-(LPC0_1+4)) ; CHECK-NEXT: LPC0_1: -; CHECK-NEXT: add r11, pc +; CHECK-NEXT: add r5, pc ; CHECK-NEXT: ldr r5, [r5] ; CHECK-NEXT: str r5, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: ldr.w r5, [r11] -; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: str r5, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: str.w r11, [r5] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: str r5, [r6] -; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: str.w r5, [r11] +; CHECK-NEXT: ldr.w r11, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: str.w r5, [r11] +; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: str r0, [r5] -; CHECK-NEXT: ldr r0, [r7, #32] -; CHECK-NEXT: stm.w sp, {r4, r9, r10} +; CHECK-NEXT: stm.w sp, {r4, r6, r10} ; CHECK-NEXT: strd r8, lr, [sp, #12] -; CHECK-NEXT: str.w r12, [sp, #20] -; CHECK-NEXT: str r0, [sp, #24] +; CHECK-NEXT: strd r12, r9, [sp, #20] ; CHECK-NEXT: bl _Manifest ; CHECK-NEXT: trap ; CHECK-NEXT: 
LBB0_4: @ %bb20 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index cc8a3b36c8305..b5c6c216affa3 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1049,10 +1049,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 ; CHECK-NEXT: cmp r3, #8 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %entry ; CHECK-NEXT: lsrs.w r12, r3, #2 @@ -1072,45 +1072,43 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r1, r7, #2 ; CHECK-NEXT: rsbs r7, r4, #0 -; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r7, r3, #16 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: wls lr, r0, .LBB16_4 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_4: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r0, r6, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; 
CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: .LBB16_5: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_7 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrh.w lr, [r3, #14] ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 -; CHECK-NEXT: ldrh.w r8, [r3, #12] +; CHECK-NEXT: ldrh.w r10, [r3, #12] ; CHECK-NEXT: ldrh r7, [r3, #10] ; CHECK-NEXT: ldrh r4, [r3, #8] ; CHECK-NEXT: ldrh r6, [r3, #6] ; CHECK-NEXT: ldrh.w r9, [r3, #4] ; CHECK-NEXT: ldrh.w r11, [r3, #2] -; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: ldrh.w r8, [r3] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: adds r0, r5, #2 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.f16 q0, q0, r10 +; CHECK-NEXT: vmul.f16 q0, q0, r8 ; CHECK-NEXT: adds r0, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r11 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] @@ -1119,77 +1117,77 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK-NEXT: add.w r0, r5, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r5, #8] +; CHECK-NEXT: add.w r6, r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vfma.f16 q0, q1, r7 ; CHECK-NEXT: vldrw.u32 q1, [r5, #12] -; CHECK-NEXT: adds r5, #16 -; CHECK-NEXT: vfma.f16 q0, q1, r8 +; CHECK-NEXT: vfma.f16 q0, q1, r10 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_8 ; CHECK-NEXT: @ %bb.6: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r5, r3, #16 ; CHECK-NEXT: dls lr, r0 -; 
CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r0, [r6], #16 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: adds r4, r5, #2 +; CHECK-NEXT: ldrh r0, [r5], #16 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: adds r4, r6, #2 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-14] -; CHECK-NEXT: adds r4, r5, #6 +; CHECK-NEXT: ldrh r0, [r5, #-14] +; CHECK-NEXT: adds r4, r6, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-12] -; CHECK-NEXT: vldrw.u32 q1, [r5, #4] +; CHECK-NEXT: ldrh r0, [r5, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r6, #4] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-10] -; CHECK-NEXT: add.w r4, r5, #10 +; CHECK-NEXT: ldrh r0, [r5, #-10] +; CHECK-NEXT: add.w r4, r6, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-8] -; CHECK-NEXT: vldrw.u32 q1, [r5, #8] +; CHECK-NEXT: ldrh r0, [r5, #-8] +; CHECK-NEXT: vldrw.u32 q1, [r6, #8] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-6] -; CHECK-NEXT: ldrh r4, [r6, #-2] +; CHECK-NEXT: ldrh r0, [r5, #-6] +; CHECK-NEXT: ldrh r4, [r5, #-2] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-4] -; CHECK-NEXT: vldrw.u32 q1, [r5, #12] +; CHECK-NEXT: ldrh r0, [r5, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r6, #12] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: add.w r0, r5, #14 +; CHECK-NEXT: add.w r0, r6, #14 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: adds r6, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add.w r5, r3, #16 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ 
%while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r4, [r6], #2 +; CHECK-NEXT: ldrh r4, [r5], #2 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_10 ; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r5, r0, lsl #1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r6, r6, r0, lsl #1 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_12: @ %if.end -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #20 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 2d73c7531fe69..39ff830e7be63 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1044,8 +1044,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: cmp r3, #8 ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %entry @@ -1053,38 +1053,36 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph ; CHECK-NEXT: ldrh r6, [r0] -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldrd r4, r10, [r0, #4] +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: ldrd r7, r10, [r0, 
#4] ; CHECK-NEXT: sub.w r0, r6, #8 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 -; CHECK-NEXT: asrs r7, r3, #3 -; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: asrs r5, r3, #3 +; CHECK-NEXT: cmp r5, #1 ; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r5, r3, #3 -; CHECK-NEXT: add.w r3, r4, r6, lsl #2 +; CHECK-NEXT: asrgt r4, r3, #3 +; CHECK-NEXT: add.w r3, r7, r6, lsl #2 ; CHECK-NEXT: sub.w r9, r3, #4 ; CHECK-NEXT: rsbs r3, r6, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: add.w r3, r10, #32 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp] @ 4-byte Spill +; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_3: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload ; CHECK-NEXT: wls lr, r0, .LBB16_4 ; CHECK-NEXT: b .LBB16_9 ; CHECK-NEXT: .LBB16_4: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: add.w r0, r4, r0, lsl #2 -; CHECK-NEXT: add.w r4, r0, #16 +; CHECK-NEXT: add.w r0, r7, r0, lsl #2 +; CHECK-NEXT: add.w r7, r0, #16 ; CHECK-NEXT: beq .LBB16_12 ; CHECK-NEXT: .LBB16_5: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -1092,79 +1090,79 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 ; CHECK-NEXT: add.w lr, r10, #8 ; CHECK-NEXT: vldrw.u32 
q0, [r1], #16 -; CHECK-NEXT: ldrd r3, r7, [r10] +; CHECK-NEXT: ldrd r3, r4, [r10] ; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr} ; CHECK-NEXT: ldrd r11, r8, [r10, #24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 -; CHECK-NEXT: vldrw.u32 q0, [r4], #32 -; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill -; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] +; CHECK-NEXT: vldrw.u32 q0, [r7], #32 +; CHECK-NEXT: strd r9, r1, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: vldrw.u32 q1, [r7, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r3 -; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] -; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] -; CHECK-NEXT: vfma.f32 q0, q1, r7 -; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] +; CHECK-NEXT: vldrw.u32 q6, [r7, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r7, #-20] +; CHECK-NEXT: vfma.f32 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q5, [r7, #-16] ; CHECK-NEXT: vfma.f32 q0, q6, r0 -; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] +; CHECK-NEXT: vldrw.u32 q2, [r7, #-12] ; CHECK-NEXT: vfma.f32 q0, q4, r5 -; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] +; CHECK-NEXT: vldrw.u32 q3, [r7, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r6 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vfma.f32 q0, q2, lr -; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r7, #-4] ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: blo .LBB16_8 ; CHECK-NEXT: @ %bb.6: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r4, r10, #32 ; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} -; CHECK-NEXT: vldrw.u32 q1, [r4], #32 -; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] -; CHECK-NEXT: vldrw.u32 q4, [r4, 
#-20] +; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11} +; CHECK-NEXT: vldrw.u32 q1, [r7], #32 +; CHECK-NEXT: vldrw.u32 q6, [r7, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r7, #-20] ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] -; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] -; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r7, #-28] +; CHECK-NEXT: vldrw.u32 q5, [r7, #-16] +; CHECK-NEXT: vldrw.u32 q2, [r7, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r3 -; CHECK-NEXT: ldrd r9, r1, [r7, #24] +; CHECK-NEXT: ldrd r9, r1, [r4, #24] ; CHECK-NEXT: vfma.f32 q0, q6, r5 -; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] +; CHECK-NEXT: vldrw.u32 q3, [r7, #-8] ; CHECK-NEXT: vfma.f32 q0, q4, r6 -; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r7, #-4] ; CHECK-NEXT: vfma.f32 q0, q5, r8 -; CHECK-NEXT: adds r7, #32 +; CHECK-NEXT: adds r4, #32 ; CHECK-NEXT: vfma.f32 q0, q2, r11 ; CHECK-NEXT: vfma.f32 q0, q3, r9 ; CHECK-NEXT: vfma.f32 q0, q1, r1 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r4, r10, #32 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: .LBB16_10: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r0, [r7], #4 +; CHECK-NEXT: ldr r0, [r4], #4 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4 ; CHECK-NEXT: vfma.f32 q0, q1, r0 ; CHECK-NEXT: le lr, .LBB16_10 ; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r4, r4, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r7, r7, r0, lsl #2 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_12: @ %if.end -; CHECK-NEXT: add sp, 
#32 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index 303ee3a5c19e7..17a47e5ec54c7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -105,26 +105,26 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: subs r1, #2 -; CHECK-NEXT: cmp r1, #2 +; CHECK-NEXT: ldr r3, [r0, #4] +; CHECK-NEXT: subs r3, #2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: cmp r3, #2 ; CHECK-NEXT: blo .LBB1_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: ldr r3, [r0] ; CHECK-NEXT: add.w r11, r3, r12, lsl #2 -; CHECK-NEXT: add.w r7, r3, r12, lsl #3 -; CHECK-NEXT: lsl.w r9, r12, #3 +; CHECK-NEXT: add.w r6, r3, r12, lsl #3 +; CHECK-NEXT: lsl.w r10, r12, #3 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 -; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r10, r4, #1 +; CHECK-NEXT: add.w r9, r4, #1 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body @@ -139,19 +139,20 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: vadd.f32 s8, s2, s3 -; CHECK-NEXT: add.w r0, r2, r10, lsl #2 +; CHECK-NEXT: add.w r0, r2, r9, lsl #2 ; CHECK-NEXT: 
vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r11, r9 +; CHECK-NEXT: add r11, r10 ; CHECK-NEXT: vadd.f32 s2, s6, s7 -; CHECK-NEXT: add r7, r9 +; CHECK-NEXT: add r6, r10 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s8 ; CHECK-NEXT: vadd.f32 s2, s4, s2 ; CHECK-NEXT: vstr s0, [r0] ; CHECK-NEXT: add.w r0, r2, r4, lsl #2 ; CHECK-NEXT: adds r4, #2 -; CHECK-NEXT: cmp r4, r1 ; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r4, r0 ; CHECK-NEXT: blo .LBB1_2 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #4 @@ -231,46 +232,40 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: subs r1, #3 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB2_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r9, [r0, #8] ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r0, r3, r3, lsl #1 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r1, r3, lsl #3 -; CHECK-NEXT: adds r3, #3 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r1, r0, lsl #2 -; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w r0, r9, r9, lsl #1 +; CHECK-NEXT: add.w r10, r1, r9, lsl #2 +; CHECK-NEXT: add.w r12, r1, r9, lsl #3 +; CHECK-NEXT: add.w r8, r1, r0, lsl #2 +; CHECK-NEXT: add.w r1, r9, #3 +; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: lsl.w r11, r0, #2 -; CHECK-NEXT: add.w r1, r5, 
r3, lsr #2 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w r1, r5, r1, lsr #2 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: adds r0, r5, #2 -; CHECK-NEXT: adds r2, r5, #1 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: mov r4, r8 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dlstp.32 lr, r7 +; CHECK-NEXT: dlstp.32 lr, r9 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -285,31 +280,31 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vadd.f32 s12, s10, s11 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: adds r0, r5, #1 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add r9, r11 +; CHECK-NEXT: add r10, r11 ; CHECK-NEXT: vadd.f32 s10, s6, s7 -; CHECK-NEXT: add.w r0, r1, r2, lsl #2 +; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: add r12, r11 ; CHECK-NEXT: vadd.f32 s6, s2, s3 -; CHECK-NEXT: add r10, r11 +; CHECK-NEXT: add r8, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s2, s8, s12 ; CHECK-NEXT: vadd.f32 s4, s4, s10 ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vstr s2, [r0] -; CHECK-NEXT: add.w r0, r1, r5, lsl #2 -; CHECK-NEXT: adds r5, #3 +; CHECK-NEXT: add.w r0, r2, r5, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte 
Reload -; CHECK-NEXT: add.w r0, r1, r0, lsl #2 +; CHECK-NEXT: adds r0, r5, #2 +; CHECK-NEXT: adds r5, #3 +; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: blo .LBB2_2 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -401,15 +396,15 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 -; CHECK-NEXT: blo.w .LBB3_5 +; CHECK-NEXT: blo .LBB3_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: movs r6, #1 @@ -417,31 +412,25 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: add.w r0, r2, r2, lsl #1 ; CHECK-NEXT: add.w r12, r1, r2, lsl #2 ; CHECK-NEXT: add.w r8, r1, r2, lsl #3 -; CHECK-NEXT: add.w r9, r1, r2, lsl #4 -; CHECK-NEXT: add.w r11, r1, r0, lsl #2 +; CHECK-NEXT: add.w r10, r1, r2, lsl #4 +; CHECK-NEXT: add.w r9, r1, r0, lsl #2 ; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2 -; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill ; 
CHECK-NEXT: lsls r0, r2, #4 -; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 -; CHECK-NEXT: adds r0, r6, #3 -; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: adds r0, r6, #2 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: mov r4, r9 +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov q3, q0 @@ -462,9 +451,9 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 ; CHECK-NEXT: vadd.f32 s16, s14, s15 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #1 ; CHECK-NEXT: vadd.f32 s14, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 @@ -478,24 +467,24 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s0, s0, s6 ; CHECK-NEXT: vstr s2, [r0] ; CHECK-NEXT: add.w r0, r1, r6, lsl #2 -; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #2 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #3 +; 
CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: add r12, r0 ; CHECK-NEXT: add r8, r0 -; CHECK-NEXT: add r11, r0 ; CHECK-NEXT: add r9, r0 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add r10, r0 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: blo .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -600,48 +589,41 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #5 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB4_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] -; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [r0, #8] +; CHECK-NEXT: ldr r3, [r0] +; CHECK-NEXT: adds r0, r1, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 -; CHECK-NEXT: subs r1, r0, #4 +; CHECK-NEXT: add.w r12, r3, r1, lsl #2 +; CHECK-NEXT: subs r3, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 -; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r3, r3, lsl #2 -; CHECK-NEXT: lsls r1, r1, #2 -; CHECK-NEXT: str r1, 
[sp, #4] @ 4-byte Spill +; CHECK-NEXT: lsls r5, r1, #2 +; CHECK-NEXT: add.w r3, r0, r3, lsr #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r1, r1, lsl #2 +; CHECK-NEXT: lsls r3, r3, #2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r0, #2 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #1 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: dlstp.32 lr, r7 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -649,11 +631,11 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vldrw.u32 q5, [r4], #16 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16 ; CHECK-NEXT: vfma.f32 q3, q6, q5 -; CHECK-NEXT: add.w r12, r9, r5 +; CHECK-NEXT: add.w r10, r9, r5 ; CHECK-NEXT: vldrw.u32 q6, [r9] ; CHECK-NEXT: vfma.f32 q4, q6, q5 -; CHECK-NEXT: add.w r6, r12, r5 -; CHECK-NEXT: vldrw.u32 q6, [r12] +; CHECK-NEXT: add.w r6, r10, r5 +; CHECK-NEXT: vldrw.u32 q6, [r10] ; CHECK-NEXT: vfma.f32 q2, q6, q5 ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vldrw.u32 q6, [r6] @@ -664,7 +646,7 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 ; 
CHECK-NEXT: vadd.f32 s20, s18, s19 -; CHECK-NEXT: add.w r1, r2, r11, lsl #2 +; CHECK-NEXT: add.w r3, r2, r11, lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 ; CHECK-NEXT: vadd.f32 s18, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 @@ -679,25 +661,26 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s4, s4, s14 ; CHECK-NEXT: vadd.f32 s6, s8, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s10 -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r0, lsl #2 +; CHECK-NEXT: vstr s2, [r3] +; CHECK-NEXT: add.w r3, r2, r0, lsl #2 +; CHECK-NEXT: vstr s12, [r3] +; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: add.w r3, r2, r3, lsl #2 +; CHECK-NEXT: vstr s6, [r3] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: add.w r3, r2, r3, lsl #2 +; CHECK-NEXT: vstr s0, [r3] +; CHECK-NEXT: adds r3, r0, #4 ; CHECK-NEXT: adds r0, #5 -; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: add.w r1, r2, r10, lsl #2 -; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r1, r2, r1, lsl #2 -; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r8, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: blo.w .LBB4_2 +; CHECK-NEXT: add.w r3, r2, r3, lsl #2 +; CHECK-NEXT: vstr s4, [r3] +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: add r12, r3 +; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: cmp r0, r3 +; CHECK-NEXT: blo .LBB4_2 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -815,63 +798,54 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, 
d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #6 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB5_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r9, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r9, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r9, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: lsl.w r5, r9, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r3, r3, lsl #1 +; CHECK-NEXT: add.w r1, r9, r9, lsl #1 ; CHECK-NEXT: lsls r1, r1, #3 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB5_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r11, r0, #2 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: 
vmov q4, q1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: dlstp.32 lr, r7 +; CHECK-NEXT: dlstp.32 lr, r9 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r12, r3, r5 +; CHECK-NEXT: add.w r10, r3, r5 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16 ; CHECK-NEXT: vfma.f32 q4, q7, q6 -; CHECK-NEXT: add.w r10, r12, r5 -; CHECK-NEXT: vldrw.u32 q7, [r12] -; CHECK-NEXT: vfma.f32 q5, q7, q6 -; CHECK-NEXT: add.w r6, r10, r5 +; CHECK-NEXT: add.w r11, r10, r5 ; CHECK-NEXT: vldrw.u32 q7, [r10] +; CHECK-NEXT: vfma.f32 q5, q7, q6 +; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: vldrw.u32 q7, [r11] ; CHECK-NEXT: vfma.f32 q2, q7, q6 ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vldrw.u32 q7, [r6] @@ -905,26 +879,27 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s0, s0, s10 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: vstr s1, [r1] -; CHECK-NEXT: add.w r1, r2, r11, lsl #2 +; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: add r8, r1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add r12, r1 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 ; CHECK-NEXT: 
.LBB5_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1055,107 +1030,97 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #72 -; CHECK-NEXT: sub sp, #72 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #7 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB6_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r10, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r10, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r10, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: lsl.w r5, r10, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: rsb r1, r3, r3, lsl #3 -; CHECK-NEXT: lsls r1, r1, #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: rsb r1, r10, r10, lsl #3 +; CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: 
str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: add.w r12, r0, #1 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q5, q2 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q6, q2 ; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: dls lr, r6 +; CHECK-NEXT: mov r9, r10 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r10, r3, r5 -; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: add.w r11, r3, r5 +; CHECK-NEXT: vctp.32 r9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 ; CHECK-NEXT: vfmat.f32 q5, q0, q7 -; CHECK-NEXT: add.w r11, r10, r5 +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r10] +; CHECK-NEXT: vldrwt.u32 q0, [r11] ; CHECK-NEXT: vfmat.f32 q6, q0, q7 -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r11] +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q4 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q2, q3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vldrw.u32 
q1, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 -; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: adds r6, r7, r5 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: sub.w r9, r9, #4 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r7] +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vfmat.f32 q3, q0, q7 -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q0, [r6] -; CHECK-NEXT: vfmat.f32 q4, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r7] +; CHECK-NEXT: vfmat.f32 q4, q0, q7 +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vfmat.f32 q2, q0, q7 ; CHECK-NEXT: le lr, .LBB6_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1 ; CHECK-NEXT: vadd.f32 s0, s26, s27 -; CHECK-NEXT: add.w r1, r2, r8, lsl #2 +; CHECK-NEXT: add.w r1, r2, r12, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s24, s25 ; CHECK-NEXT: vadd.f32 s3, s20, s21 ; CHECK-NEXT: vadd.f32 s1, s22, s23 @@ -1165,43 +1130,43 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vadd.f32 s14, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s10, s18, s19 ; CHECK-NEXT: vadd.f32 s9, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s2, s3, s1 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s6, s18, s19 ; CHECK-NEXT: vadd.f32 
s5, s16, s17 +; CHECK-NEXT: vadd.f32 s2, s3, s1 ; CHECK-NEXT: vadd.f32 s4, s4, s14 -; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s12, s12, s11 -; CHECK-NEXT: adds r0, #7 ; CHECK-NEXT: vadd.f32 s10, s9, s10 -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r4, lsl #2 +; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s8, s8, s20 ; CHECK-NEXT: vadd.f32 s6, s5, s6 +; CHECK-NEXT: vstr s2, [r1] +; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s10, [r1] -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #6 +; CHECK-NEXT: adds r0, #7 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r9, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: add r8, r1 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #72 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1345,105 +1310,95 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub 
sp, #88 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB7_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r11, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r11, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: add.w r9, r1, r11, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: lsl.w r5, r11, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: lsls r1, r3, #5 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: lsl.w r1, r11, #5 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 -; CHECK-NEXT: adds r1, r0, #7 -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #3 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: add.w r8, r0, #2 -; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r12, r0, #2 +; CHECK-NEXT: add.w r8, r0, #1 +; CHECK-NEXT: mov 
r3, r9 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q6, q3 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q7, q3 ; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: mov r10, r7 -; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: dls lr, r6 +; CHECK-NEXT: mov r10, r11 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r11, r3, r5 +; CHECK-NEXT: adds r6, r3, r5 ; CHECK-NEXT: vctp.32 r10 ; CHECK-NEXT: vpsttt -; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 ; CHECK-NEXT: vfmat.f32 q6, q1, q0 -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r11] +; CHECK-NEXT: vldrwt.u32 q1, [r6] ; CHECK-NEXT: vfmat.f32 q7, q1, q0 -; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: adds r7, r6, r5 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r7] -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q1, q0 -; 
CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov q2, q4 ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q3, q5 ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: sub.w r10, r10, #4 -; CHECK-NEXT: adds r6, r7, r5 +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q1, [r7] -; CHECK-NEXT: vfmat.f32 q4, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r6] +; CHECK-NEXT: vfmat.f32 q4, q1, q0 +; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vfmat.f32 q5, q1, q0 -; CHECK-NEXT: add r6, r5 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r6] ; CHECK-NEXT: vfmat.f32 q3, q1, q0 @@ -1451,7 +1406,7 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1 ; CHECK-NEXT: vadd.f32 s0, s30, s31 -; CHECK-NEXT: add.w r1, r2, r1, lsl #2 +; CHECK-NEXT: add.w r1, r2, r8, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s28, s29 ; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vadd.f32 s5, s14, s15 @@ -1459,12 +1414,12 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s6, s24, s25 ; CHECK-NEXT: vadd.f32 s14, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s13, s10, s11 ; CHECK-NEXT: vadd.f32 s10, s18, s19 ; CHECK-NEXT: vadd.f32 s9, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 
s0, s2, s0 ; CHECK-NEXT: vadd.f32 s11, s18, s19 ; CHECK-NEXT: vadd.f32 s15, s16, s17 @@ -1475,35 +1430,35 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s8, s8, s13 -; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: vadd.f32 s14, s15, s11 ; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r8, lsl #2 +; CHECK-NEXT: add.w r1, r2, r12, lsl #2 ; CHECK-NEXT: vadd.f32 s1, s22, s23 ; CHECK-NEXT: vadd.f32 s3, s20, s21 ; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vadd.f32 s4, s3, s1 +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 +; CHECK-NEXT: vadd.f32 s4, s3, s1 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #6 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #7 +; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r12, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: add r9, r1 +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB7_2 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll index 0db0d4ebd43c2..9cde4c1cf8cba 100644 --- 
a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -432,24 +432,24 @@ define i32 @add_U320_without_i128_add(%struct.U320* nocapture dereferenceable(40 ; CHECK-NEXT: adcq %rdx, 8(%rdi) ; CHECK-NEXT: movq %rax, %rdx ; CHECK-NEXT: adcq %rcx, %rdx -; CHECK-NEXT: movq 24(%rdi), %r11 -; CHECK-NEXT: leaq (%r8,%r11), %r14 +; CHECK-NEXT: movq 24(%rdi), %r14 +; CHECK-NEXT: leaq (%r8,%r14), %r11 ; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: cmpq %r10, %rdx ; CHECK-NEXT: setb %bl ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: adcq %r14, %rbx -; CHECK-NEXT: movq 32(%rdi), %r10 -; CHECK-NEXT: leaq (%r9,%r10), %rcx +; CHECK-NEXT: adcq %r11, %rbx +; CHECK-NEXT: movq 32(%rdi), %rcx +; CHECK-NEXT: leaq (%r9,%rcx), %r10 ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: cmpq %r14, %rbx +; CHECK-NEXT: cmpq %r11, %rbx ; CHECK-NEXT: setb %sil -; CHECK-NEXT: addq %r11, %r8 -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: addq %r14, %r8 +; CHECK-NEXT: adcq %r10, %rsi ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpq %rcx, %rsi +; CHECK-NEXT: cmpq %r10, %rsi ; CHECK-NEXT: setb %al -; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: addq %rcx, %r9 ; CHECK-NEXT: movq %rdx, 16(%rdi) ; CHECK-NEXT: movq %rbx, 24(%rdi) ; CHECK-NEXT: movq %rsi, 32(%rdi) diff --git a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll index fc3303f7a0c89..3f79a201bafeb 100644 --- a/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-blockplacement.ll @@ -23,11 +23,11 @@ define i32 @foo(i32 %arg, i32 (i8*)* %arg3) nounwind { ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_5 ; CHECK-NEXT: # %bb.1: # %bb5 -; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: movq %rsi, %r12 ; CHECK-NEXT: movslq %edi, %rbp ; CHECK-NEXT: leaq (,%rbp,8), %rax -; CHECK-NEXT: leaq global(%rax,%rax,2), %r15 -; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r12 +; CHECK-NEXT: leaq global(%rax,%rax,2), %r14 +; CHECK-NEXT: leaq global+4(%rax,%rax,2), %r15 
; CHECK-NEXT: xorl %r13d, %r13d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # %bb8 @@ -35,10 +35,10 @@ define i32 @foo(i32 %arg, i32 (i8*)* %arg3) nounwind { ; CHECK-NEXT: callq bar@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rax, %rdi -; CHECK-NEXT: callq *%r14 -; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: callq *%r12 +; CHECK-NEXT: movq %r14, %rdi ; CHECK-NEXT: callq hoge@PLT -; CHECK-NEXT: movq %r12, %rdi +; CHECK-NEXT: movq %r15, %rdi ; CHECK-NEXT: callq hoge@PLT ; CHECK-NEXT: testb %r13b, %r13b ; CHECK-NEXT: jne .LBB0_2 diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index 75439f8118607..17cab1b3f1d55 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movl (%r15), %eax ; CHECK-NEXT: leal 8(,%rcx,8), %ecx ; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%r12), %rcx -; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: leaq 32(%r12), %rbx ; CHECK-NEXT: shlq $3, %r13 ; CHECK-NEXT: xorl %esi, %esi @@ -189,16 +187,17 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: jae .LBB1_7 ; CHECK-NEXT: # %bb.6: # %vector.memcheck ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; CHECK-NEXT: leaq 8(%r12), %rax +; CHECK-NEXT: addq %rax, %r10 ; CHECK-NEXT: leaq (%r10,%r11,8), %rax ; CHECK-NEXT: cmpq %rcx, %rax ; CHECK-NEXT: ja .LBB1_14 ; CHECK-NEXT: .LBB1_7: # %vector.body.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: leaq -4(%r8), %rax -; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: shrq $2, %r10 -; CHECK-NEXT: btl $2, %eax +; CHECK-NEXT: leaq -4(%r8), %r10 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: shrq $2, %rax +; CHECK-NEXT: btl $2, %r10d ; CHECK-NEXT: jb .LBB1_8 ; CHECK-NEXT: # %bb.9: 
# %vector.body.prol.preheader ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 @@ -207,12 +206,12 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movdqu %xmm0, (%rdi,%r9,8) ; CHECK-NEXT: movdqu %xmm0, 16(%rdi,%r9,8) ; CHECK-NEXT: movl $4, %r11d -; CHECK-NEXT: testq %r10, %r10 +; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: jne .LBB1_11 ; CHECK-NEXT: jmp .LBB1_13 ; CHECK-NEXT: .LBB1_8: # in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: xorl %r11d, %r11d -; CHECK-NEXT: testq %r10, %r10 +; CHECK-NEXT: testq %rax, %rax ; CHECK-NEXT: je .LBB1_13 ; CHECK-NEXT: .LBB1_11: # %vector.body.preheader.new ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll index 39a9ac5751f29..047a9acbbdcc5 100644 --- a/llvm/test/CodeGen/X86/inalloca-invoke.ll +++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll @@ -24,7 +24,6 @@ blah: ; CHECK: pushl %eax ; CHECK: subl $20, %esp ; CHECK: movl %esp, %[[beg:[^ ]*]] -; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] call void @begin(%Iter* sret(%Iter) %temp.lvalue) ; CHECK: calll _begin @@ -33,6 +32,7 @@ blah: to label %invoke.cont unwind label %lpad ; Uses end as sret param. +; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] ; CHECK: pushl %[[end]] ; CHECK: calll _plus diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll index 0ab655419c88c..c189142958e7a 100644 --- a/llvm/test/CodeGen/X86/licm-regpressure.ll +++ b/llvm/test/CodeGen/X86/licm-regpressure.ll @@ -1,10 +1,34 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -; This tests currently fails as MachineLICM does not compute register pressure +; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s -check-prefix=MIR + +; This tests should fail as MachineLICM does not compute register pressure ; correctly. More details: llvm.org/PR23143 + +; It however does not show any spills because leaq is rematerialized instead +; of spilling. 
+ +; Stopping after MachineLICM however exposes all ADD64ri8 instructions +; to be hoisted which still has to be avoided. + ; XFAIL: * ; MachineLICM should take register pressure into account. -; CHECK-NOT: Spill +; CHECK-LABEL: {{^}}test: +; CHECK-NOT: Spill +; CHECK-COUNT-4: leaq +; CHECK-NOT: Spill +; CHECK: [[LOOP:\.LBB[0-9_]+]]: +; CHECK-NOT: Reload +; CHECK-COUNT-2: leaq +; CHECK-NOT: Reload +; CHECK: jne [[LOOP]] + +; MIR-LABEL: name: test +; MIR: bb.0.entry: +; MIR-COUNT-4: ADD64ri8 +; MIR: bb.1.loop-body: +; MIR-COUNT-2: ADD64ri8 +; MIR: JCC_1 %bb.1 %struct.A = type { i32, i32, i32, i32, i32, i32, i32 } diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll index f53b06eff0559..6a629e3515b00 100644 --- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -91,48 +91,48 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) { ; CHECK-NEXT: ## %bb.10: ## %do.end ; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: xorl %r13d, %r13d -; CHECK-NEXT: testb %r13b, %r13b +; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: testb %r12b, %r12b ; CHECK-NEXT: jne LBB0_11 ; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader -; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx -; CHECK-NEXT: leaq LJTI0_1(%rip), %rbx +; CHECK-NEXT: leaq LJTI0_1(%rip), %r13 ; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: jmp LBB0_13 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_20: ## %sw.bb256 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movl %r13d, %r14d +; CHECK-NEXT: movl %r12d, %r14d ; CHECK-NEXT: LBB0_21: ## %while.cond197.backedge ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 ; CHECK-NEXT: decl %r15d ; CHECK-NEXT: testl %r15d, %r15d -; CHECK-NEXT: movl 
%r14d, %r13d +; CHECK-NEXT: movl %r14d, %r12d ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: LBB0_13: ## %while.body200 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_29 Depth 2 ; CHECK-NEXT: ## Child Loop BB0_38 Depth 2 -; CHECK-NEXT: leal -268(%r13), %eax +; CHECK-NEXT: leal -268(%r12), %eax ; CHECK-NEXT: cmpl $105, %eax ; CHECK-NEXT: ja LBB0_14 ; CHECK-NEXT: ## %bb.56: ## %while.body200 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: movslq (%rbx,%rax,4), %rax -; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: movslq (%r13,%rax,4), %rax +; CHECK-NEXT: addq %r13, %rax ; CHECK-NEXT: jmpq *%rax ; CHECK-NEXT: LBB0_44: ## %while.cond1037.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: testb %r12b, %r12b -; CHECK-NEXT: movl %r13d, %r14d +; CHECK-NEXT: testb %bl, %bl +; CHECK-NEXT: movl %r12d, %r14d ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: jmp LBB0_55 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB0_14: ## %while.body200 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: leal 1(%r13), %eax +; CHECK-NEXT: leal 1(%r12), %eax ; CHECK-NEXT: cmpl $21, %eax ; CHECK-NEXT: ja LBB0_20 ; CHECK-NEXT: ## %bb.15: ## %while.body200 @@ -147,12 +147,12 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) { ; CHECK-NEXT: jmp LBB0_21 ; CHECK-NEXT: LBB0_26: ## %sw.bb474 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: ## implicit-def: $rbp ; CHECK-NEXT: jne LBB0_34 ; CHECK-NEXT: ## %bb.27: ## %do.body479.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: ## implicit-def: $rbp ; CHECK-NEXT: jne LBB0_34 ; CHECK-NEXT: ## %bb.28: ## %land.rhs485.preheader @@ -163,7 +163,7 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) { ; CHECK-NEXT: LBB0_32: ## %do.body479.backedge ; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2 ; CHECK-NEXT: leaq 1(%rbp), %rax -; 
CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: je LBB0_33 ; CHECK-NEXT: LBB0_29: ## %land.rhs485 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1 @@ -173,13 +173,13 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) { ; CHECK-NEXT: ## %bb.30: ## %cond.true.i.i2780 ; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2 ; CHECK-NEXT: movq %rax, %rbp -; CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_32 ; CHECK-NEXT: ## %bb.31: ## %lor.rhs500 ; CHECK-NEXT: ## in Loop: Header=BB0_29 Depth=2 ; CHECK-NEXT: movl $256, %esi ## imm = 0x100 ; CHECK-NEXT: callq ___maskrune -; CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_32 ; CHECK-NEXT: jmp LBB0_34 ; CHECK-NEXT: LBB0_45: ## %sw.bb1134 @@ -229,13 +229,13 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) { ; CHECK-NEXT: LBB0_38: ## %for.cond534 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: jne LBB0_38 ; CHECK-NEXT: ## %bb.39: ## %for.cond542.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1 -; CHECK-NEXT: testb %r12b, %r12b +; CHECK-NEXT: testb %bl, %bl ; CHECK-NEXT: movb $0, (%rbp) -; CHECK-NEXT: movl %r13d, %r14d +; CHECK-NEXT: movl %r12d, %r14d ; CHECK-NEXT: leaq LJTI0_0(%rip), %rdx ; CHECK-NEXT: jmp LBB0_21 ; CHECK-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index a5471415e6e6d..b6a2e641fd65b 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -276,8 +276,6 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: decq %rax -; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill ; X64-NEXT: testq %rbx, %rbx ; X64-NEXT: sets %al ; X64-NEXT: testq %r12, %r12 @@ -291,7 +289,8 @@ define i64 @func5(i64 %x, i64 %y) 
nounwind { ; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: testb %bpl, %al -; X64-NEXT: cmovneq (%rsp), %r13 # 8-byte Folded Reload +; X64-NEXT: leaq -1(%r13), %rax +; X64-NEXT: cmovneq %rax, %r13 ; X64-NEXT: movq %r13, %rax ; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx From 93d08acaacec951dbb302f77eeae51974985b6b2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Zakharin Date: Thu, 12 Aug 2021 17:23:28 -0700 Subject: [PATCH 145/700] [clang-offload-wrapper] Add standard notes for ELF offload images The patch adds ELF notes into SHT_NOTE sections of ELF offload images passed to clang-offload-wrapper. The new notes use a null-terminated "LLVMOMPOFFLOAD" note name. There are currently three types of notes: VERSION: a string (not null-terminated) representing the ELF offload image structure. The current version '1.0' does not put any restrictions on the structure of the image. If we ever need to come up with a common structure for ELF offload images (e.g. to be able to analyze the images in libomptarget in some standard way), then we will introduce new versions. PRODUCER: a vendor specific name of the producing toolchain. Upstream LLVM uses "LLVM" (not null-terminated). PRODUCER_VERSION: a vendor specific version of the producing toolchain. Upstream LLVM uses LLVM_VERSION_STRING with optional LLVM_REVISION. All three notes are not mandatory currently. 
Differential Revision: https://reviews.llvm.org/D99551 --- .../Driver/Inputs/empty-elf-template.yaml | 5 + clang/test/Driver/clang-offload-wrapper.c | 24 +- .../clang-offload-wrapper/CMakeLists.txt | 2 +- .../ClangOffloadWrapper.cpp | 294 +++++++++++++++++- 4 files changed, 318 insertions(+), 7 deletions(-) create mode 100644 clang/test/Driver/Inputs/empty-elf-template.yaml diff --git a/clang/test/Driver/Inputs/empty-elf-template.yaml b/clang/test/Driver/Inputs/empty-elf-template.yaml new file mode 100644 index 0000000000000..f77de07a430f6 --- /dev/null +++ b/clang/test/Driver/Inputs/empty-elf-template.yaml @@ -0,0 +1,5 @@ +--- !ELF +FileHeader: + Class: ELFCLASS[[BITS]] + Data: ELFDATA2[[ENCODING]] + Type: ET_REL diff --git a/clang/test/Driver/clang-offload-wrapper.c b/clang/test/Driver/clang-offload-wrapper.c index 9a36559e34dd7..c671d88209744 100644 --- a/clang/test/Driver/clang-offload-wrapper.c +++ b/clang/test/Driver/clang-offload-wrapper.c @@ -19,9 +19,10 @@ // // Check bitcode produced by the wrapper tool. // -// RUN: clang-offload-wrapper -target=x86_64-pc-linux-gnu -o %t.wrapper.bc %t.tgt +// RUN: clang-offload-wrapper -target=x86_64-pc-linux-gnu -o %t.wrapper.bc %t.tgt 2>&1 | FileCheck %s --check-prefix ELF-WARNING // RUN: llvm-dis %t.wrapper.bc -o - | FileCheck %s --check-prefix CHECK-IR +// ELF-WARNING: is not an ELF image, so notes cannot be added to it. 
// CHECK-IR: target triple = "x86_64-pc-linux-gnu" // CHECK-IR-DAG: [[ENTTY:%.+]] = type { i8*, i8*, i{{32|64}}, i32, i32 } @@ -53,3 +54,24 @@ // CHECK-IR: ret void // CHECK-IR: declare void @__tgt_unregister_lib([[DESCTY]]*) + +// Check that clang-offload-wrapper adds LLVMOMPOFFLOAD notes +// into the ELF offload images: +// RUN: yaml2obj %S/Inputs/empty-elf-template.yaml -o %t.64le -DBITS=64 -DENCODING=LSB +// RUN: clang-offload-wrapper -target=x86_64-pc-linux-gnu -o %t.wrapper.elf64le.bc %t.64le +// RUN: llvm-dis %t.wrapper.elf64le.bc -o - | FileCheck %s --check-prefix OMPNOTES +// RUN: yaml2obj %S/Inputs/empty-elf-template.yaml -o %t.64be -DBITS=64 -DENCODING=MSB +// RUN: clang-offload-wrapper -target=x86_64-pc-linux-gnu -o %t.wrapper.elf64be.bc %t.64be +// RUN: llvm-dis %t.wrapper.elf64be.bc -o - | FileCheck %s --check-prefix OMPNOTES +// RUN: yaml2obj %S/Inputs/empty-elf-template.yaml -o %t.32le -DBITS=32 -DENCODING=LSB +// RUN: clang-offload-wrapper -target=x86_64-pc-linux-gnu -o %t.wrapper.elf32le.bc %t.32le +// RUN: llvm-dis %t.wrapper.elf32le.bc -o - | FileCheck %s --check-prefix OMPNOTES +// RUN: yaml2obj %S/Inputs/empty-elf-template.yaml -o %t.32be -DBITS=32 -DENCODING=MSB +// RUN: clang-offload-wrapper -target=x86_64-pc-linux-gnu -o %t.wrapper.elf32be.bc %t.32be +// RUN: llvm-dis %t.wrapper.elf32be.bc -o - | FileCheck %s --check-prefix OMPNOTES + +// There is no clean way for extracting the offload image +// from the object file currently, so try to find +// the inserted ELF notes in the device image variable's +// initializer: +// OMPNOTES: @{{.+}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"{{.*}}LLVMOMPOFFLOAD{{.*}}LLVMOMPOFFLOAD{{.*}}LLVMOMPOFFLOAD{{.*}}" diff --git a/clang/tools/clang-offload-wrapper/CMakeLists.txt b/clang/tools/clang-offload-wrapper/CMakeLists.txt index 8bcb46267a37c..144edf5ab60c0 100644 --- a/clang/tools/clang-offload-wrapper/CMakeLists.txt +++ b/clang/tools/clang-offload-wrapper/CMakeLists.txt @@ -1,4 +1,4 @@ 
-set(LLVM_LINK_COMPONENTS BitWriter Core Support TransformUtils) +set(LLVM_LINK_COMPONENTS BitWriter Core Object Support TransformUtils) add_clang_tool(clang-offload-wrapper ClangOffloadWrapper.cpp diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp index fff12716b6f0d..bbadd909089e3 100644 --- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp +++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp @@ -17,27 +17,37 @@ #include "clang/Basic/Version.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/EndianStream.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/VCSRevision.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include +#define OPENMP_OFFLOAD_IMAGE_VERSION "1.0" + using namespace llvm; +using namespace llvm::object; static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden); @@ -60,6 +70,12 @@ static cl::opt cl::desc("Target triple for the output module"), cl::value_desc("triple"), cl::cat(ClangOffloadWrapperCategory)); +static cl::opt SaveTemps( + "save-temps", + cl::desc("Save temporary files that may be produced by the tool. 
" + "This option forces print-out of the temporary files' names."), + cl::Hidden); + namespace { class BinaryWrapper { @@ -70,6 +86,15 @@ class BinaryWrapper { StructType *ImageTy = nullptr; StructType *DescTy = nullptr; + std::string ToolName; + std::string ObjcopyPath; + // Temporary file names that may be created during adding notes + // to ELF offload images. Use -save-temps to keep them and also + // see their names. A temporary file's name includes the name + // of the original input ELF image, so you can easily match + // them, if you have multiple inputs. + std::vector TempFiles; + private: IntegerType *getSizeTTy() { switch (M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(C))) { @@ -294,8 +319,61 @@ class BinaryWrapper { } public: - BinaryWrapper(StringRef Target) : M("offload.wrapper.object", C) { + BinaryWrapper(StringRef Target, StringRef ToolName) + : M("offload.wrapper.object", C), ToolName(ToolName) { M.setTargetTriple(Target); + // Look for llvm-objcopy in the same directory, from which + // clang-offload-wrapper is invoked. This helps OpenMP offload + // LIT tests. + + // This just needs to be some symbol in the binary; C++ doesn't + // allow taking the address of ::main however. + void *P = (void *)(intptr_t)&Help; + std::string COWPath = sys::fs::getMainExecutable(ToolName.str().c_str(), P); + if (!COWPath.empty()) { + auto COWDir = sys::path::parent_path(COWPath); + ErrorOr ObjcopyPathOrErr = + sys::findProgramByName("llvm-objcopy", {COWDir}); + if (ObjcopyPathOrErr) { + ObjcopyPath = *ObjcopyPathOrErr; + return; + } + + // Otherwise, look through PATH environment. 
+ } + + ErrorOr ObjcopyPathOrErr = + sys::findProgramByName("llvm-objcopy"); + if (!ObjcopyPathOrErr) { + WithColor::warning(errs(), ToolName) + << "cannot find llvm-objcopy[.exe] in PATH; ELF notes cannot be " + "added.\n"; + return; + } + + ObjcopyPath = *ObjcopyPathOrErr; + } + + ~BinaryWrapper() { + if (TempFiles.empty()) + return; + + StringRef ToolNameRef(ToolName); + auto warningOS = [ToolNameRef]() -> raw_ostream & { + return WithColor::warning(errs(), ToolNameRef); + }; + + for (auto &F : TempFiles) { + if (SaveTemps) { + warningOS() << "keeping temporary file " << F << "\n"; + continue; + } + + auto EC = sys::fs::remove(F, false); + if (EC) + warningOS() << "cannot remove temporary file " << F << ": " + << EC.message().c_str() << "\n"; + } } const Module &wrapBinaries(ArrayRef> Binaries) { @@ -305,6 +383,205 @@ class BinaryWrapper { createUnregisterFunction(Desc); return M; } + + std::unique_ptr addELFNotes(std::unique_ptr Buf, + StringRef OriginalFileName) { + // Cannot add notes, if llvm-objcopy is not available. + // + // I did not find a clean way to add a new notes section into an existing + // ELF file. llvm-objcopy seems to recreate a new ELF from scratch, + // and we just try to use llvm-objcopy here. + if (ObjcopyPath.empty()) + return Buf; + + StringRef ToolNameRef(ToolName); + + // Helpers to emit warnings. + auto warningOS = [ToolNameRef]() -> raw_ostream & { + return WithColor::warning(errs(), ToolNameRef); + }; + auto handleErrorAsWarning = [&warningOS](Error E) { + logAllUnhandledErrors(std::move(E), warningOS()); + }; + + Expected> BinOrErr = + ObjectFile::createELFObjectFile(Buf->getMemBufferRef(), + /*InitContent=*/false); + if (Error E = BinOrErr.takeError()) { + consumeError(std::move(E)); + // This warning is questionable, but let it be here, + // assuming that most OpenMP offload models use ELF offload images. 
+ warningOS() << OriginalFileName + << " is not an ELF image, so notes cannot be added to it.\n"; + return Buf; + } + + // If we fail to add the note section, we just pass through the original + // ELF image for wrapping. At some point we should enforce the note section + // and start emitting errors vs warnings. + support::endianness Endianness; + if (isa(BinOrErr->get()) || + isa(BinOrErr->get())) { + Endianness = support::little; + } else if (isa(BinOrErr->get()) || + isa(BinOrErr->get())) { + Endianness = support::big; + } else { + warningOS() << OriginalFileName + << " is an ELF image of unrecognized format.\n"; + return Buf; + } + + // Create temporary file for the data of a new SHT_NOTE section. + // We fill it in with data and then pass to llvm-objcopy invocation + // for reading. + Twine NotesFileModel = OriginalFileName + Twine(".elfnotes.%%%%%%%.tmp"); + Expected NotesTemp = + sys::fs::TempFile::create(NotesFileModel); + if (Error E = NotesTemp.takeError()) { + handleErrorAsWarning(createFileError(NotesFileModel, std::move(E))); + return Buf; + } + TempFiles.push_back(NotesTemp->TmpName); + + // Create temporary file for the updated ELF image. + // This is an empty file that we pass to llvm-objcopy invocation + // for writing. + Twine ELFFileModel = OriginalFileName + Twine(".elfwithnotes.%%%%%%%.tmp"); + Expected ELFTemp = + sys::fs::TempFile::create(ELFFileModel); + if (Error E = ELFTemp.takeError()) { + handleErrorAsWarning(createFileError(ELFFileModel, std::move(E))); + return Buf; + } + TempFiles.push_back(ELFTemp->TmpName); + + // Keep the new ELF image file to reserve the name for the future + // llvm-objcopy invocation. + std::string ELFTmpFileName = ELFTemp->TmpName; + if (Error E = ELFTemp->keep(ELFTmpFileName)) { + handleErrorAsWarning(createFileError(ELFTmpFileName, std::move(E))); + return Buf; + } + + // Write notes to the *elfnotes*.tmp file. 
+ raw_fd_ostream NotesOS(NotesTemp->FD, false); + + struct NoteTy { + // Note name is a null-terminated "LLVMOMPOFFLOAD". + std::string Name; + // Note type defined in llvm/include/llvm/BinaryFormat/ELF.h. + uint32_t Type = 0; + // Each note has type-specific associated data. + std::string Desc; + + NoteTy(std::string &&Name, uint32_t Type, std::string &&Desc) + : Name(std::move(Name)), Type(Type), Desc(std::move(Desc)) {} + }; + + // So far we emit just three notes. + SmallVector Notes; + // Version of the offload image identifying the structure of the ELF image. + // Version 1.0 does not have any specific requirements. + // We may come up with some structure that has to be honored by all + // offload implementations in future (e.g. to let libomptarget + // get some information from the offload image). + Notes.emplace_back("LLVMOMPOFFLOAD", ELF::NT_LLVM_OPENMP_OFFLOAD_VERSION, + OPENMP_OFFLOAD_IMAGE_VERSION); + // This is a producer identification string. We are LLVM! + Notes.emplace_back("LLVMOMPOFFLOAD", ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER, + "LLVM"); + // This is a producer version. Use the same format that is used + // by clang to report the LLVM version. + Notes.emplace_back("LLVMOMPOFFLOAD", + ELF::NT_LLVM_OPENMP_OFFLOAD_PRODUCER_VERSION, + LLVM_VERSION_STRING +#ifdef LLVM_REVISION + " " LLVM_REVISION +#endif + ); + + // Return the amount of padding required for a blob of N bytes + // to be aligned to Alignment bytes. + auto getPadAmount = [](uint32_t N, uint32_t Alignment) -> uint32_t { + uint32_t Mod = (N % Alignment); + if (Mod == 0) + return 0; + return Alignment - Mod; + }; + auto emitPadding = [&getPadAmount](raw_ostream &OS, uint32_t Size) { + for (uint32_t I = 0; I < getPadAmount(Size, 4); ++I) + OS << '\0'; + }; + + // Put notes into the file. + for (auto &N : Notes) { + assert(!N.Name.empty() && "We should not create notes with empty names."); + // Name must be null-terminated. 
+ if (N.Name.back() != '\0') + N.Name += '\0'; + uint32_t NameSz = N.Name.size(); + uint32_t DescSz = N.Desc.size(); + // A note starts with three 4-byte values: + // NameSz + // DescSz + // Type + // These three fields are endian-sensitive. + support::endian::write(NotesOS, NameSz, Endianness); + support::endian::write(NotesOS, DescSz, Endianness); + support::endian::write(NotesOS, N.Type, Endianness); + // Next, we have a null-terminated Name padded to a 4-byte boundary. + NotesOS << N.Name; + emitPadding(NotesOS, NameSz); + if (DescSz == 0) + continue; + // Finally, we have a descriptor, which is an arbitrary flow of bytes. + NotesOS << N.Desc; + emitPadding(NotesOS, DescSz); + } + NotesOS.flush(); + + // Keep the notes file. + std::string NotesTmpFileName = NotesTemp->TmpName; + if (Error E = NotesTemp->keep(NotesTmpFileName)) { + handleErrorAsWarning(createFileError(NotesTmpFileName, std::move(E))); + return Buf; + } + + // Run llvm-objcopy like this: + // llvm-objcopy --add-section=.note.openmp= \ + // + // + // This will add a SHT_NOTE section on top of the original ELF. + std::vector Args; + Args.push_back(ObjcopyPath); + std::string Option("--add-section=.note.openmp=" + NotesTmpFileName); + Args.push_back(Option); + Args.push_back(OriginalFileName); + Args.push_back(ELFTmpFileName); + bool ExecutionFailed = false; + std::string ErrMsg; + (void)sys::ExecuteAndWait(ObjcopyPath, Args, + /*Env=*/llvm::None, /*Redirects=*/{}, + /*SecondsToWait=*/0, + /*MemoryLimit=*/0, &ErrMsg, &ExecutionFailed); + + if (ExecutionFailed) { + warningOS() << ErrMsg << "\n"; + return Buf; + } + + // Substitute the original ELF with new one. 
+ ErrorOr> BufOrErr = + MemoryBuffer::getFile(ELFTmpFileName); + if (!BufOrErr) { + handleErrorAsWarning( + createFileError(ELFTmpFileName, BufOrErr.getError())); + return Buf; + } + + return std::move(*BufOrErr); + } }; } // anonymous namespace @@ -338,6 +615,8 @@ int main(int argc, const char **argv) { return 1; } + BinaryWrapper Wrapper(Target, argv[0]); + // Read device binaries. SmallVector, 4u> Buffers; SmallVector, 4u> Images; @@ -350,8 +629,13 @@ int main(int argc, const char **argv) { reportError(createFileError(File, BufOrErr.getError())); return 1; } + std::unique_ptr Buffer(std::move(*BufOrErr)); + if (File != "-") { + // Adding ELF notes for STDIN is not supported yet. + Buffer = Wrapper.addELFNotes(std::move(Buffer), File); + } const std::unique_ptr &Buf = - Buffers.emplace_back(std::move(*BufOrErr)); + Buffers.emplace_back(std::move(Buffer)); Images.emplace_back(Buf->getBufferStart(), Buf->getBufferSize()); } @@ -364,9 +648,9 @@ int main(int argc, const char **argv) { } // Create a wrapper for device binaries and write its bitcode to the file. - WriteBitcodeToFile(BinaryWrapper(Target).wrapBinaries( - makeArrayRef(Images.data(), Images.size())), - Out.os()); + WriteBitcodeToFile( + Wrapper.wrapBinaries(makeArrayRef(Images.data(), Images.size())), + Out.os()); if (Out.os().has_error()) { reportError(createFileError(Output, Out.os().error())); return 1; From 8c23669eeb189a94aa6caacf8e1206175586aaad Mon Sep 17 00:00:00 2001 From: Evgenii Stepanov Date: Fri, 13 Aug 2021 17:15:52 -0700 Subject: [PATCH 146/700] [hwasan] Ignore lit config.enable_aliases on non-x86. This re-enables a number of Android tests that have been lost in check-hwasan. 
Differential Revision: https://reviews.llvm.org/D108064 --- compiler-rt/test/hwasan/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/hwasan/lit.cfg.py b/compiler-rt/test/hwasan/lit.cfg.py index c94d5e0dac83d..dcae2b2932815 100644 --- a/compiler-rt/test/hwasan/lit.cfg.py +++ b/compiler-rt/test/hwasan/lit.cfg.py @@ -15,7 +15,7 @@ if config.target_arch == 'x86_64' and config.enable_aliases == '1': clang_hwasan_common_cflags += ["-fsanitize-hwaddress-experimental-aliasing"] -if config.enable_aliases != '1': +else: config.available_features.add('pointer-tagging') if config.target_arch == 'x86_64': # This does basically the same thing as tagged-globals on aarch64. Because From 65bc8ba1a2d434ba72f6f476bfd4a9ae69cb16b2 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Mon, 16 Aug 2021 13:40:07 -0700 Subject: [PATCH 147/700] Correctly update reproducer hooks for SB API I added In 9ea6dd5cfac0b233fbb148c1e2d0f81f816737c8 / https://reviews.llvm.org/D88387 where I added skinny corefile creation, I added new SB API and tried to manually update the hooks for the reproducers. I missed a spot, and I should have used lldb-instr to update the instrumentation automatically. 
--- lldb/source/API/SBMemoryRegionInfo.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lldb/source/API/SBMemoryRegionInfo.cpp b/lldb/source/API/SBMemoryRegionInfo.cpp index ab74d559387fb..d924a31cca0d4 100644 --- a/lldb/source/API/SBMemoryRegionInfo.cpp +++ b/lldb/source/API/SBMemoryRegionInfo.cpp @@ -116,14 +116,14 @@ const char *SBMemoryRegionInfo::GetName() { return m_opaque_up->GetName().AsCString(); } -bool SBMemoryRegionInfo::HasDirtyMemoryPageList() { - LLDB_RECORD_METHOD_NO_ARGS(bool, SBMemoryRegionInfo, HasDirtyMemoryPageList); +bool SBMemoryRegionInfo::HasDirtyMemoryPageList() {LLDB_RECORD_METHOD_NO_ARGS(bool, SBMemoryRegionInfo, HasDirtyMemoryPageList); + return m_opaque_up->GetDirtyPageList().hasValue(); } -uint32_t SBMemoryRegionInfo::GetNumDirtyPages() { - LLDB_RECORD_METHOD_NO_ARGS(uint32_t, SBMemoryRegionInfo, GetNumDirtyPages); +uint32_t SBMemoryRegionInfo::GetNumDirtyPages() {LLDB_RECORD_METHOD_NO_ARGS(uint32_t, SBMemoryRegionInfo, GetNumDirtyPages); + uint32_t num_dirty_pages = 0; llvm::Optional> dirty_page_list = @@ -134,9 +134,8 @@ uint32_t SBMemoryRegionInfo::GetNumDirtyPages() { return num_dirty_pages; } -addr_t SBMemoryRegionInfo::GetDirtyPageAddressAtIndex(uint32_t idx) { - LLDB_RECORD_METHOD(addr_t, SBMemoryRegionInfo, GetDirtyPageAddressAtIndex, - (uint32_t), idx); +addr_t SBMemoryRegionInfo::GetDirtyPageAddressAtIndex(uint32_t idx) {LLDB_RECORD_METHOD(lldb::addr_t, SBMemoryRegionInfo, GetDirtyPageAddressAtIndex, (uint32_t), idx); + addr_t dirty_page_addr = LLDB_INVALID_ADDRESS; const llvm::Optional> &dirty_page_list = @@ -147,8 +146,9 @@ addr_t SBMemoryRegionInfo::GetDirtyPageAddressAtIndex(uint32_t idx) { return dirty_page_addr; } -int SBMemoryRegionInfo::GetPageSize() { - LLDB_RECORD_METHOD_NO_ARGS(int, SBMemoryRegionInfo, GetPageSize); +int SBMemoryRegionInfo::GetPageSize() {LLDB_RECORD_METHOD_NO_ARGS(int, SBMemoryRegionInfo, GetPageSize); + + return m_opaque_up->GetPageSize(); } @@ -196,6 
+196,10 @@ void RegisterMethods(Registry &R) { LLDB_REGISTER_METHOD(const char *, SBMemoryRegionInfo, GetName, ()); LLDB_REGISTER_METHOD(bool, SBMemoryRegionInfo, GetDescription, (lldb::SBStream &)); + LLDB_REGISTER_METHOD(bool, SBMemoryRegionInfo, HasDirtyMemoryPageList, ()); + LLDB_REGISTER_METHOD(uint32_t, SBMemoryRegionInfo, GetNumDirtyPages, ()); + LLDB_REGISTER_METHOD(lldb::addr_t, SBMemoryRegionInfo, GetDirtyPageAddressAtIndex, (uint32_t)); + LLDB_REGISTER_METHOD(int, SBMemoryRegionInfo, GetPageSize, ()); } } From 5bab1f095270c25800d17ae6fc8c0ed1375fd306 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 16 Aug 2021 13:36:16 -0700 Subject: [PATCH 148/700] [ARM][TypePromotion] Re-generate test checks. NFC Tests were missing load/store alignment. One test in casts.ll had no check lines. --- .../Transforms/TypePromotion/ARM/calls.ll | 2 +- .../Transforms/TypePromotion/ARM/casts.ll | 20 ++++++++++++++----- .../Transforms/TypePromotion/ARM/icmps.ll | 8 ++++---- .../Transforms/TypePromotion/ARM/phis-ret.ll | 2 +- .../Transforms/TypePromotion/ARM/pointers.ll | 4 ++-- .../Transforms/TypePromotion/ARM/signed.ll | 2 +- 6 files changed, 24 insertions(+), 14 deletions(-) diff --git a/llvm/test/Transforms/TypePromotion/ARM/calls.ll b/llvm/test/Transforms/TypePromotion/ARM/calls.ll index cd273c06150f5..9b140b029beeb 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/calls.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/calls.ll @@ -169,7 +169,7 @@ define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8 ; CHECK-NEXT: [[CONV1:%.*]] = zext i8 [[TMP1]] to i16 ; CHECK-NEXT: [[CALL:%.*]] = tail call i16** @func_62(i8 zeroext undef, i32 undef, i16 signext [[CONV1]], i32* undef) -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @g_893, i32 0, i32 0), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([[STRUCT_ANON:%.*]], 
%struct.anon* @g_893, i32 0, i32 0), align 4 ; CHECK-NEXT: [[CONV2:%.*]] = trunc i32 [[TMP2]] to i16 ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: diff --git a/llvm/test/Transforms/TypePromotion/ARM/casts.ll b/llvm/test/Transforms/TypePromotion/ARM/casts.ll index 70fa617115e86..7cd9cba0b7097 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/casts.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/casts.ll @@ -32,7 +32,7 @@ define i8 @trunc_i16_i8(i16* %ptr, i16 zeroext %arg0, i8 zeroext %arg1) { ; CHECK-LABEL: @trunc_i16_i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[ARG1:%.*]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[PTR:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[PTR:%.*]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[TMP1]], [[ARG0:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc i16 [[TMP2]] to i8 ; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 @@ -132,7 +132,7 @@ entry: define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { ; CHECK-LABEL: @or_icmp_ugt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[PTR:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[PTR:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[TMP2]], 1 @@ -246,6 +246,16 @@ exit: } define i16 @bitcast_i16(i16 zeroext %arg0, i16 zeroext %arg1) { +; CHECK-LABEL: @bitcast_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[ARG0:%.*]] to i32 +; CHECK-NEXT: [[CAST:%.*]] = bitcast i16 12345 to i16 +; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[CAST]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP0]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[ADD]], [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i16 [[ARG1:%.*]], i16 32657 +; CHECK-NEXT: ret i16 [[RES]] +; entry: %cast = bitcast i16 12345 to i16 %add = add nuw i16 %arg0, 1 @@ -518,7 +528,7 @@ define i8 @search_through_zext_load(i8* %a, i8 zeroext %b, 
i16 zeroext %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[C:%.*]] to i32 -; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[A:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[A:%.*]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[LOAD]] to i32 ; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] @@ -631,7 +641,7 @@ define i16 @trunc_sink_less_than_store(i16 zeroext %a, i16 zeroext %b, i16 zeroe ; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[SUB]], 255 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[ADD]] to i8 -; CHECK-NEXT: store i8 [[TMP5]], i8* [[E:%.*]] +; CHECK-NEXT: store i8 [[TMP5]], i8* [[E:%.*]], align 1 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUB]], [[IF_THEN]] ] @@ -981,7 +991,7 @@ entry: define i32 @replace_trunk_with_mask(i16* %a) { ; CHECK-LABEL: @replace_trunk_with_mask( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[A:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[A:%.*]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[TMP0]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]] diff --git a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll index 6dda15c309b4a..f5cf2bc43681c 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll @@ -168,8 +168,8 @@ define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) { ; CHECK-LABEL: @store_dsp_res( ; CHECK-NEXT: [[FIRST:%.*]] = getelementptr inbounds i8, i8* [[IN:%.*]], i32 0 ; CHECK-NEXT: [[SECOND:%.*]] = getelementptr inbounds i8, i8* [[IN]], i32 1 -; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[FIRST]] -; CHECK-NEXT: 
[[LD1:%.*]] = load i8, i8* [[SECOND]] +; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[FIRST]], align 1 +; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[SECOND]], align 1 ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[LD0]], -1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[COMPARE:%.*]], [[LD1]] ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i8 [[COMPARE]], i8 [[XOR]] @@ -253,7 +253,7 @@ define i32 @icmp_not(i16 zeroext %arg0, i16 zeroext %arg1) { define i32 @icmp_i1(i1* %arg0, i1 zeroext %arg1, i32 %a, i32 %b) { ; CHECK-LABEL: @icmp_i1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = load i1, i1* [[ARG0:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i1, i1* [[ARG0:%.*]], align 1 ; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[LOAD]], true ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i1 [[ARG1:%.*]], [[NOT]] ; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[A:%.*]], i32 [[B:%.*]] @@ -271,7 +271,7 @@ define i32 @icmp_i7(i7* %arg0, i7 zeroext %arg1, i32 %a, i32 %b) { ; CHECK-LABEL: @icmp_i7( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = zext i7 [[ARG1:%.*]] to i32 -; CHECK-NEXT: [[LOAD:%.*]] = load i7, i7* [[ARG0:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i7, i7* [[ARG0:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i7 [[LOAD]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP1]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP0]], [[ADD]] diff --git a/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll b/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll index e79e4ff1bdb2e..8659674bb9750 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll @@ -278,7 +278,7 @@ define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) { ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[ADD]], 3 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[MUL]], [[TMP2]] ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i8 -; CHECK-NEXT: store i8 [[CONV]], i8* [[RES:%.*]] +; CHECK-NEXT: store i8 [[CONV]], i8* [[RES:%.*]], align 1 ; CHECK-NEXT: 
[[TMP3:%.*]] = trunc i32 [[TMP1]] to i16 ; CHECK-NEXT: ret i16 [[TMP3]] ; diff --git a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll index 3c5f097b1b92b..3e37550186e69 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll @@ -130,8 +130,8 @@ define i8 @call_pointer(i8 zeroext %x, i8 zeroext %y, i16* %a, i16* %b) { define i16 @pointer_to_pointer(i16** %arg, i16 zeroext %limit) { ; CHECK-LABEL: @pointer_to_pointer( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADDR:%.*]] = load i16*, i16** [[ARG:%.*]] -; CHECK-NEXT: [[VAL:%.*]] = load i16, i16* [[ADDR]] +; CHECK-NEXT: [[ADDR:%.*]] = load i16*, i16** [[ARG:%.*]], align 8 +; CHECK-NEXT: [[VAL:%.*]] = load i16, i16* [[ADDR]], align 2 ; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[VAL]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP0]], 7 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], 256 diff --git a/llvm/test/Transforms/TypePromotion/ARM/signed.ll b/llvm/test/Transforms/TypePromotion/ARM/signed.ll index 143220a53b5c2..fb60a3f101f7d 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/signed.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/signed.ll @@ -4,7 +4,7 @@ ; Test to check that ARMCodeGenPrepare doesn't optimised away sign extends. define i16 @test_signed_load(i16* %ptr) { ; CHECK-LABEL: @test_signed_load( -; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[PTR:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[PTR:%.*]], align 2 ; CHECK-NEXT: [[CONV0:%.*]] = zext i16 [[LOAD]] to i32 ; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CONV0]], [[CONV1]] From aa575ed918632358b65da3e161d55254092b7416 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Mon, 16 Aug 2021 13:46:12 -0700 Subject: [PATCH 149/700] Ah, fix formatting, I didn't notice lldb-instr's code additions were unusually formatted. 
--- lldb/source/API/SBMemoryRegionInfo.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lldb/source/API/SBMemoryRegionInfo.cpp b/lldb/source/API/SBMemoryRegionInfo.cpp index d924a31cca0d4..c0f5456223da5 100644 --- a/lldb/source/API/SBMemoryRegionInfo.cpp +++ b/lldb/source/API/SBMemoryRegionInfo.cpp @@ -116,14 +116,14 @@ const char *SBMemoryRegionInfo::GetName() { return m_opaque_up->GetName().AsCString(); } -bool SBMemoryRegionInfo::HasDirtyMemoryPageList() {LLDB_RECORD_METHOD_NO_ARGS(bool, SBMemoryRegionInfo, HasDirtyMemoryPageList); - +bool SBMemoryRegionInfo::HasDirtyMemoryPageList() { + LLDB_RECORD_METHOD_NO_ARGS(bool, SBMemoryRegionInfo, HasDirtyMemoryPageList); return m_opaque_up->GetDirtyPageList().hasValue(); } -uint32_t SBMemoryRegionInfo::GetNumDirtyPages() {LLDB_RECORD_METHOD_NO_ARGS(uint32_t, SBMemoryRegionInfo, GetNumDirtyPages); - +uint32_t SBMemoryRegionInfo::GetNumDirtyPages() { + LLDB_RECORD_METHOD_NO_ARGS(uint32_t, SBMemoryRegionInfo, GetNumDirtyPages); uint32_t num_dirty_pages = 0; llvm::Optional> dirty_page_list = @@ -134,8 +134,9 @@ uint32_t SBMemoryRegionInfo::GetNumDirtyPages() {LLDB_RECORD_METHOD_NO_ARGS(uint return num_dirty_pages; } -addr_t SBMemoryRegionInfo::GetDirtyPageAddressAtIndex(uint32_t idx) {LLDB_RECORD_METHOD(lldb::addr_t, SBMemoryRegionInfo, GetDirtyPageAddressAtIndex, (uint32_t), idx); - +addr_t SBMemoryRegionInfo::GetDirtyPageAddressAtIndex(uint32_t idx) { + LLDB_RECORD_METHOD(lldb::addr_t, SBMemoryRegionInfo, + GetDirtyPageAddressAtIndex, (uint32_t), idx); addr_t dirty_page_addr = LLDB_INVALID_ADDRESS; const llvm::Optional> &dirty_page_list = @@ -146,8 +147,8 @@ addr_t SBMemoryRegionInfo::GetDirtyPageAddressAtIndex(uint32_t idx) {LLDB_RECORD return dirty_page_addr; } -int SBMemoryRegionInfo::GetPageSize() {LLDB_RECORD_METHOD_NO_ARGS(int, SBMemoryRegionInfo, GetPageSize); - +int SBMemoryRegionInfo::GetPageSize() { + LLDB_RECORD_METHOD_NO_ARGS(int, SBMemoryRegionInfo, GetPageSize); return 
m_opaque_up->GetPageSize(); } From f22ba51873509b93732015176b778465f40c6db5 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi Date: Mon, 16 Aug 2021 14:56:01 -0600 Subject: [PATCH 150/700] [Remarks] Emit optimization remarks for atomics generating CAS loop Implements ORE in AtomicExpand pass to report atomics generating a compare and swap loop. Differential Revision: https://reviews.llvm.org/D106891 --- .../CodeGenCUDA/atomics-remarks-gfx90a.cu | 16 +++ .../CodeGenOpenCL/atomics-remarks-gfx90a.cl | 43 ++++++++ llvm/lib/CodeGen/AtomicExpandPass.cpp | 17 ++- .../CodeGen/AMDGPU/atomics-remarks-gfx90a.ll | 103 ++++++++++++++++++ 4 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu create mode 100644 clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl create mode 100644 llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu new file mode 100644 index 0000000000000..96892286fd75e --- /dev/null +++ b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +#include "Inputs/cuda.h" +#include + +// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope +// GFX90A-CAS-LABEL: _Z14atomic_add_casPf +// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc +// GFX90A-CAS: s_cbranch_execnz +__device__ float atomic_add_cas(float *p) { + return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); +} diff --git a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl new file mode 100644 index 0000000000000..127866e84e051 --- /dev/null +++ b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl @@ 
-0,0 +1,43 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=REMARK + +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +typedef enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand] +// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand] +// GFX90A-CAS-LABEL: @atomic_cas +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic +// GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("one-as") monotonic +// 
GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("wavefront-one-as") monotonic +float atomic_cas(__global atomic_float *d, float a) { + float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); + float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device); + float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices); + float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group); +} diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 125a3be585cb5..a27d43e43a855 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -570,7 +571,9 @@ static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, } bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + LLVMContext &Ctx = AI->getModule()->getContext(); + TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); + switch (Kind) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: { @@ -600,6 +603,18 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { expandPartwordAtomicRMW(AI, TargetLoweringBase::AtomicExpansionKind::CmpXChg); } else { + SmallVector SSNs; + Ctx.getSyncScopeNames(SSNs); + auto MemScope = SSNs[AI->getSyncScopeID()].empty() + ? 
"system" + : SSNs[AI->getSyncScopeID()]; + OptimizationRemarkEmitter ORE(AI->getFunction()); + ORE.emit([&]() { + return OptimizationRemark(DEBUG_TYPE, "Passed", AI->getFunction()) + << "A compare and swap loop was generated for an atomic " + << AI->getOperationName(AI->getOperation()) << " operation at " + << MemScope << " memory scope"; + }); expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun); } return true; diff --git a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll new file mode 100644 index 0000000000000..240963cfe9009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS + +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope +; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at singlethread-one-as memory scope + +; GFX90A-CAS-LABEL: atomic_add_cas: +; GFX90A-CAS: 
flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_agent: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_agent(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_workgroup: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_wavefront: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_singlethread: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void 
@atomic_add_cas_agent_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as: +; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) { +entry: + %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4 + ret void +} From f328f72e60a97b77d7f4ffb8570a931edbb0a0d5 Mon Sep 17 00:00:00 2001 From: Rob Suderman Date: Mon, 16 Aug 2021 13:47:00 -0700 Subject: [PATCH 151/700] [mlir][tosa] Fixed depthwise conv parallel/reduction indices order Reduction axis should come after all parallel axis to work with vectorization. 
Reviewed By: NatashaKnk Differential Revision: https://reviews.llvm.org/D108005 --- .../Linalg/IR/LinalgNamedStructuredOps.yaml | 20 +++++++++---------- .../linalg/opdsl/ops/core_named_ops.py | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index b5199d1e40ad3..cb5db86bbe65d 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -949,19 +949,19 @@ structured_op: !LinalgStructuredOpConfig indexing_maps: !LinalgIndexingMapsConfig static_indexing_maps: - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d5)> + s9, s10, s11, s12] -> (d0, d1 * s9 + d5 * s11, d2 * s10 + d6 * s12, d3)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d3, d4, d5, d6)> + s9, s10, s11, s12] -> (d5, d6, d3, d4)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1, d2, d5, d6)> + s9, s10, s11, s12] -> (d0, d1, d2, d3, d4)> iterator_types: - parallel - parallel - parallel - - reduction - - reduction - parallel - parallel + - reduction + - reduction assignments: - !ScalarAssign arg: O @@ -1039,23 +1039,23 @@ structured_op: !LinalgStructuredOpConfig indexing_maps: !LinalgIndexingMapsConfig static_indexing_maps: - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1 * s9 + d3 * s11, d2 * s10 + d4 * s12, d5)> + s9, s10, s11, s12] -> (d0, d1 * s9 + d5 * s11, d2 * s10 + d6 * s12, d3)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d3, d4, d5, d6)> + s9, s10, s11, s12] -> (d5, d6, d3, d4)> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, 
s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] -> ()> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12] -> ()> - affine_map<(d0, d1, d2, d3, d4, d5, d6)[s0, s1, s2, s3, s4, s5, s6, s7, s8, - s9, s10, s11, s12] -> (d0, d1, d2, d5, d6)> + s9, s10, s11, s12] -> (d0, d1, d2, d3, d4)> iterator_types: - parallel - parallel - parallel - - reduction - - reduction - parallel - parallel + - reduction + - reduction assignments: - !ScalarAssign arg: O diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index 21ca35bf1036f..0590e6721f61b 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -209,7 +209,7 @@ def depthwise_conv2D_nhwc( Numeric casting is performed on the operands to the inner multiply, promoting them to the same data type as the accumulator/output. """ - domain(D.n, D.oh, D.ow, D.kh, D.kw, D.ic, D.cm) + domain(D.n, D.oh, D.ow, D.ic, D.cm, D.kh, D.kw) O[D.n, D.oh, D.ow, D.ic, D.cm] += cast( U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.ic]) * cast(U, K[D.kh, D.kw, D.ic, D.cm]) @@ -228,7 +228,7 @@ def depthwise_conv2D_nhwc_q( Numeric casting is performed on the operands to the inner multiply, promoting them to the same data type as the accumulator/output. """ - domain(D.n, D.oh, D.ow, D.kh, D.kw, D.ic, D.cm) + domain(D.n, D.oh, D.ow, D.ic, D.cm, D.kh, D.kw) O[D.n, D.oh, D.ow, D.ic, D.cm] += ( (cast(U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.ic]) - cast(U, IZp)) * From 2c6448cdc2f68f8c28fd0bd9404182b81306e6e6 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 14 Aug 2021 16:51:10 -0700 Subject: [PATCH 152/700] [sanitizer] Define 32bit uptr as uint This makes it consistent with uintptr_t. It's 45138f788c9b3c4ac5d9ae4479841c411c15190e with Darwin fix. 
Reviewed By: kstoimenov Differential Revision: https://reviews.llvm.org/D108163 --- .../sanitizer_common/sanitizer_internal_defs.h | 15 +++++++++++---- .../tests/sanitizer_bitvector_test.cpp | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index 056b00a10e2be..db26d9e213f22 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,8 +139,13 @@ namespace __sanitizer { typedef unsigned long long uptr; typedef signed long long sptr; #else +# if (SANITIZER_WORDSIZE == 64) typedef unsigned long uptr; typedef signed long sptr; +# else +typedef unsigned int uptr; +typedef signed int sptr; +# endif #endif // defined(_WIN64) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use @@ -182,15 +187,17 @@ typedef uptr OFF_T; #endif typedef u64 OFF64_T; -#if (SANITIZER_WORDSIZE == 64) || SANITIZER_MAC +#if (SANITIZER_WORDSIZE == 64) typedef uptr operator_new_size_type; #else -# if defined(__s390__) && !defined(__s390x__) +# if defined(__s390__) && !defined(__s390x__) // Special case: 31-bit s390 has unsigned long as size_t. 
typedef unsigned long operator_new_size_type; -# else +# elif SANITIZER_MAC +typedef unsigned long operator_new_size_type; +# else typedef u32 operator_new_size_type; -# endif +# endif #endif typedef u64 tid_t; diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp index 670e96552c68f..385b6158300ca 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp @@ -71,7 +71,7 @@ void Print(const set &s) { #if defined(_WIN64) fprintf(stderr, "%llu ", *it); #else - fprintf(stderr, "%lu ", *it); + fprintf(stderr, "%zu ", *it); #endif } fprintf(stderr, "\n"); From b2aaafb8377ac1ab081e7c0d3ba92ee5eb4de07c Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Mon, 16 Aug 2021 11:59:15 -0700 Subject: [PATCH 153/700] [scudo] Use stdint types for internal types (redo) This is a redo of D108089 that broke some 32-bit builds. `scudo::uptr` was defined as an `unsigned long` on 32-b platform, while a `uintptr_t` is usually defined as an `unsigned int`. This worked, this was not consistent, particularly with regard to format string specifiers. As suggested by Vitaly, since we are including `stdint.h`, define the internal scudo integer types to those. 
Differential Revision: https://reviews.llvm.org/D108152 --- compiler-rt/lib/scudo/standalone/combined.h | 2 +- .../lib/scudo/standalone/internal_defs.h | 20 +++++++++---------- compiler-rt/lib/scudo/standalone/secondary.h | 2 +- .../lib/scudo/standalone/wrappers_c_checks.h | 4 +++- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 922c2e50bb0b4..371fb783a06eb 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -920,7 +920,7 @@ class Allocator { if (!Depot->find(Hash, &RingPos, &Size)) return; for (unsigned I = 0; I != Size && I != MaxTraceSize; ++I) - Trace[I] = (*Depot)[RingPos + I]; + Trace[I] = static_cast((*Depot)[RingPos + I]); } static void getErrorInfo(struct scudo_error_info *ErrorInfo, diff --git a/compiler-rt/lib/scudo/standalone/internal_defs.h b/compiler-rt/lib/scudo/standalone/internal_defs.h index c9ffad136b78d..621fc9c45e952 100644 --- a/compiler-rt/lib/scudo/standalone/internal_defs.h +++ b/compiler-rt/lib/scudo/standalone/internal_defs.h @@ -78,16 +78,16 @@ namespace scudo { -typedef unsigned long uptr; -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; -typedef unsigned long long u64; -typedef signed long sptr; -typedef signed char s8; -typedef signed short s16; -typedef signed int s32; -typedef signed long long s64; +typedef uintptr_t uptr; +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef intptr_t sptr; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; // The following two functions have platform specific implementations. 
void outputRaw(const char *Buffer); diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 630e64d46edf7..aa50fa98b1138 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -485,7 +485,7 @@ void *MapAllocator::allocate(Options Options, uptr Size, uptr Alignment, FillContentsMode FillContents) { if (Options.get(OptionBit::AddLargeAllocationSlack)) Size += 1UL << SCUDO_MIN_ALIGNMENT_LOG; - Alignment = Max(Alignment, 1UL << SCUDO_MIN_ALIGNMENT_LOG); + Alignment = Max(Alignment, uptr(1U) << SCUDO_MIN_ALIGNMENT_LOG); const uptr PageSize = getPageSizeCached(); uptr RoundedSize = roundUpTo(roundUpTo(Size, Alignment) + LargeBlock::getHeaderSize() + diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h b/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h index 7fc1a9600e53b..ec9c1a104e83c 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h +++ b/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h @@ -46,8 +46,10 @@ inline bool checkPosixMemalignAlignment(uptr Alignment) { // builtin supported by recent clang & GCC if it exists, otherwise fallback to a // costly division. inline bool checkForCallocOverflow(uptr Size, uptr N, uptr *Product) { -#if __has_builtin(__builtin_umull_overflow) +#if __has_builtin(__builtin_umull_overflow) && (SCUDO_WORDSIZE == 64U) return __builtin_umull_overflow(Size, N, Product); +#elif __has_builtin(__builtin_umul_overflow) && (SCUDO_WORDSIZE == 32U) + return __builtin_umul_overflow(Size, N, Product); #else *Product = Size * N; if (!Size) From 9236dea255a87d7f48a6b7a27ba2fa21a676304d Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 16 Aug 2021 22:58:12 +0100 Subject: [PATCH 154/700] [ARM] Create MQQPR and MQQQQPR register classes Similar to the MQPR register class as the MVE equivalent to QPR, this adds MQQPR and MQQQQPR register classes for the MVE equivalents of QQPR and QQQQPR registers. 
The MVE MQPR seemed have worked out quite well, and adding MQQPR and MQQQQPR allows us to a little more accurately specify the number of registers, calculating register pressure limits a little better. Differential Revision: https://reviews.llvm.org/D107463 --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 58 +++--- llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 7 + llvm/lib/Target/ARM/ARMISelLowering.cpp | 8 +- llvm/lib/Target/ARM/ARMInstrMVE.td | 16 +- llvm/lib/Target/ARM/ARMRegisterInfo.td | 6 + .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 8 +- .../ARM/Disassembler/ARMDisassembler.cpp | 22 ++- llvm/test/CodeGen/Thumb2/mve-vld2.ll | 6 +- llvm/test/CodeGen/Thumb2/mve-vld4.ll | 178 +++++++++--------- 9 files changed, 165 insertions(+), 144 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9b058ff7dbcbd..643bc6d2d4e02 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1241,7 +1241,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 32: - if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { + if (ARM::QQPRRegClass.hasSubClassEq(RC) || + ARM::MQQPRRegClass.hasSubClassEq(RC) || + ARM::DQuadRegClass.hasSubClassEq(RC)) { if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && Subtarget.hasNEON()) { // FIXME: It's possible to only store part of the QQ register if the @@ -1267,7 +1269,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 64: - if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { + if (ARM::QQQQPRRegClass.hasSubClassEq(RC) || + ARM::MQQQQPRRegClass.hasSubClassEq(RC)) { MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) @@ -1473,31 +1476,34 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, 
MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 32: - if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && - Subtarget.hasNEON()) { - BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) - .addFrameIndex(FI) - .addImm(16) - .addMemOperand(MMO) - .add(predOps(ARMCC::AL)); - } else { - MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) - .addFrameIndex(FI) - .add(predOps(ARMCC::AL)) - .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - if (Register::isPhysicalRegister(DestReg)) - MIB.addReg(DestReg, RegState::ImplicitDefine); - } - } else - llvm_unreachable("Unknown reg class!"); - break; + if (ARM::QQPRRegClass.hasSubClassEq(RC) || + ARM::MQQPRRegClass.hasSubClassEq(RC) || + ARM::DQuadRegClass.hasSubClassEq(RC)) { + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && + Subtarget.hasNEON()) { + BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) + .addFrameIndex(FI) + .addImm(16) + .addMemOperand(MMO) + .add(predOps(ARMCC::AL)); + } else { + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) + .addFrameIndex(FI) + .add(predOps(ARMCC::AL)) + .addMemOperand(MMO); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + if (Register::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); + } + } else + llvm_unreachable("Unknown reg class!"); + break; case 64: - if 
(ARM::QQQQPRRegClass.hasSubClassEq(RC)) { + if (ARM::QQQQPRRegClass.hasSubClassEq(RC) || + ARM::MQQQQPRRegClass.hasSubClassEq(RC)) { MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 4883e5693f87d..5b2a1f40703b5 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -263,6 +263,13 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, case ARM::QQQQPRRegClassID: if (MF.getSubtarget().hasNEON()) return Super; + break; + case ARM::MQPRRegClassID: + case ARM::MQQPRRegClassID: + case ARM::MQQQQPRRegClassID: + if (MF.getSubtarget().hasMVEIntegerOps()) + return Super; + break; } Super = *I++; } while (Super); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e3f5f4948bf64..cae7170c6d3b4 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1853,12 +1853,18 @@ ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive // MVE Q registers. 
- if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { + if (Subtarget->hasNEON()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) return &ARM::QQQQPRRegClass; } + if (Subtarget->hasMVEIntegerOps()) { + if (VT == MVT::v4i64) + return &ARM::MQQPRRegClass; + if (VT == MVT::v8i64) + return &ARM::MQQQQPRRegClass; + } return TargetLowering::getRegClassFor(VT); } diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 372893814092e..5577de05a6e35 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -97,7 +97,7 @@ def VecList2QAsmOperand : AsmOperandClass { "q-registers in range [q0,q7]"; } -def VecList2Q : RegisterOperand { +def VecList2Q : RegisterOperand { let ParserMatchClass = VecList2QAsmOperand; let PrintMethod = "printMVEVectorList<2>"; } @@ -110,7 +110,7 @@ def VecList4QAsmOperand : AsmOperandClass { "q-registers in range [q0,q7]"; } -def VecList4Q : RegisterOperand { +def VecList4Q : RegisterOperand { let ParserMatchClass = VecList4QAsmOperand; let PrintMethod = "printMVEVectorList<4>"; } @@ -6037,13 +6037,13 @@ multiclass MVE_vst24_patterns { def : Pat<(int_arm_mve_vst2q i32:$addr, (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), (!cast("MVE_VST2"#stage#"_"#lanesize) - (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + (REG_SEQUENCE MQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), t2_addr_offset_none:$addr)>; foreach stage = [0,1] in def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32), (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))), (i32 (!cast("MVE_VST2"#stage#"_"#lanesize#_wb) - (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + (REG_SEQUENCE MQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), t2_addr_offset_none:$addr))>; foreach stage = [0,1,2,3] in @@ -6051,16 +6051,16 @@ multiclass MVE_vst24_patterns { (VT MQPR:$v0), (VT MQPR:$v1), (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), (!cast("MVE_VST4"#stage#"_"#lanesize) - (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, - 
VT:$v2, qsub_2, VT:$v3, qsub_3), + (REG_SEQUENCE MQQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), t2_addr_offset_none:$addr)>; foreach stage = [0,1,2,3] in def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64), (VT MQPR:$v0), (VT MQPR:$v1), (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))), (i32 (!cast("MVE_VST4"#stage#"_"#lanesize#_wb) - (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, - VT:$v2, qsub_2, VT:$v3, qsub_3), + (REG_SEQUENCE MQQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), t2_addr_offset_none:$addr))>; } defm : MVE_vst24_patterns<8, v16i8>; diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 93f3b75823480..9752b3166b454 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -557,6 +557,9 @@ def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> { let AltOrderSelect = [{ return 1; }]; } +// Same as QQPR but for MVE, containing the 7 register pairs made up from Q0-Q7. +def MQQPR : RegisterClass<"ARM", [v4i64], 256, (trunc QQPR, 7)>; + // Tuples of 4 D regs that isn't also a pair of Q regs. def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3], [(decimate (shl DPR, 1), 2), @@ -580,6 +583,9 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> { let AltOrderSelect = [{ return 1; }]; } +// Same as QQPR but for MVE, containing the 5 register quads made up from Q0-Q7. +def MQQQQPR : RegisterClass<"ARM", [v8i64], 256, (trunc QQQQPR, 5)>; + // Pseudo-registers representing 2-spaced consecutive D registers. 
def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2], diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index e410fe0aeff28..339919db2bdea 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -3343,16 +3343,16 @@ class ARMOperand : public MCParsedAsmOperand { // regs) or q0-q4 (for 4) // // The MVE instructions taking a register range of this kind will - // need an operand in the QQPR or QQQQPR class, representing the + // need an operand in the MQQPR or MQQQQPR class, representing the // entire range as a unit. So we must translate into that class, // by finding the index of the base register in the MQPR reg // class, and returning the super-register at the corresponding // index in the target class. const MCRegisterClass *RC_in = &ARMMCRegisterClasses[ARM::MQPRRegClassID]; - const MCRegisterClass *RC_out = (VectorList.Count == 2) ? - &ARMMCRegisterClasses[ARM::QQPRRegClassID] : - &ARMMCRegisterClasses[ARM::QQQQPRRegClassID]; + const MCRegisterClass *RC_out = + (VectorList.Count == 2) ? 
&ARMMCRegisterClasses[ARM::MQQPRRegClassID] + : &ARMMCRegisterClasses[ARM::MQQQQPRRegClassID]; unsigned I, E = RC_out->getNumRegs(); for (I = 0; I < E; I++) diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 51fd450345345..f9a786840db00 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -227,10 +227,12 @@ static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeMQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, @@ -6154,9 +6156,9 @@ static const uint16_t QQPRDecoderTable[] = { ARM::Q4_Q5, ARM::Q5_Q6, ARM::Q6_Q7 }; -static DecodeStatus DecodeQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus DecodeMQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { if (RegNo > 6) return MCDisassembler::Fail; @@ -6170,9 +6172,9 @@ static const uint16_t QQQQPRDecoderTable[] = { ARM::Q3_Q4_Q5_Q6, ARM::Q4_Q5_Q6_Q7 }; -static DecodeStatus DecodeQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder) { +static DecodeStatus 
DecodeMQQQQPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { if (RegNo > 4) return MCDisassembler::Fail; diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll index 5e5328bf755d2..93967f052b0aa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -77,8 +77,9 @@ define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) { ; CHECK-NEXT: vld20.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vld20.32 {q1, q2}, [r3] -; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4 ; CHECK-NEXT: vld21.32 {q5, q6}, [r0] +; CHECK-NEXT: vadd.i32 q3, q3, q4 ; CHECK-NEXT: vld21.32 {q1, q2}, [r3] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 @@ -479,8 +480,9 @@ define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) { ; CHECK-NEXT: vld20.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vld20.32 {q1, q2}, [r3] -; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4 ; CHECK-NEXT: vld21.32 {q5, q6}, [r0] +; CHECK-NEXT: vadd.f32 q3, q3, q4 ; CHECK-NEXT: vld21.32 {q1, q2}, [r3] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] ; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index c1b984761dcdf..74b6b8d7e2843 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -73,27 +73,27 @@ entry: define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vld4_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, 
q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! ; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q6, q2, q3 +; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q5, q0, q1 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i32 q5, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vadd.i32 q0, q0, q6 -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vadd.i32 q5, q3, q4 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vadd.i32 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i32>, <32 x i32>* %src, align 4 @@ -126,50 +126,46 @@ define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) { ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! 
; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.i32 q4, q2, q3 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q6, q0, q1 ; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vadd.i32 q6, q5, q6 -; CHECK-NEXT: vstrw.32 q6, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q0, q1, q0 +; CHECK-NEXT: vadd.i32 q2, q6, q2 +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q2, q3, q4 ; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q1, q3, q5 -; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: 
vld43.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload -; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 -; CHECK-NEXT: vadd.i32 q2, q3, q4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 ; CHECK-NEXT: vadd.i32 q1, q5, q6 +; CHECK-NEXT: vadd.i32 q2, q3, q4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.i32 q1, q2, q1 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} @@ -349,27 +345,27 @@ entry: define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vld4_v16i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! 
; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q6, q2, q3 +; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q5, q0, q1 +; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.i16 q4, q5, q4 +; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.i16 q5, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vld40.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vadd.i16 q0, q0, q6 -; CHECK-NEXT: vld41.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.16 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vadd.i16 q5, q3, q4 -; CHECK-NEXT: vadd.i16 q1, q1, q2 -; CHECK-NEXT: vadd.i16 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vadd.i16 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i16>, <64 x i16>* %src, align 2 @@ -871,27 +867,27 @@ entry: define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vld4_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q6, q2, q3 +; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q5, q0, q1 +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f32 q5, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vadd.f32 q0, q0, q6 -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vadd.f32 q5, q3, q4 -; CHECK-NEXT: vadd.f32 q1, q1, q2 -; CHECK-NEXT: vadd.f32 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x float>, <32 x float>* %src, align 4 @@ -924,50 +920,46 @@ define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) { ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! 
; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.f32 q4, q2, q3 -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vadd.f32 q6, q0, q1 ; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vadd.f32 q6, q5, q6 -; CHECK-NEXT: vstrw.32 q6, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 q0, q1, q0 +; CHECK-NEXT: vadd.f32 q2, q6, q2 +; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vadd.f32 q2, q3, q4 ; CHECK-NEXT: vadd.f32 q0, q0, q2 -; CHECK-NEXT: vadd.f32 q1, q3, q5 -; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: 
vld43.32 {q1, q2, q3, q4}, [r0] +; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: vadd.f32 q1, q3, q1 ; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload -; CHECK-NEXT: vadd.f32 q1, q2, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 -; CHECK-NEXT: vadd.f32 q2, q3, q4 ; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 ; CHECK-NEXT: vadd.f32 q1, q5, q6 +; CHECK-NEXT: vadd.f32 q2, q3, q4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.f32 q1, q2, q1 ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: add sp, #112 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} From 8e4efad9917ce0b7d1751c34a8d6907e610050e6 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 16 Aug 2021 22:12:00 +0000 Subject: [PATCH 155/700] [libc] Optimize Loop strategy Since the precondition for loop is `size >= T::kSize` we always expect at least one run of the loop. This patch transforms the for-loop into a do/while-loop which saves at least one test. We also add a second template parameter to allow the Tail operation to differ from the loop operation. 
--- libc/src/string/memory_utils/elements.h | 33 ++++++++++++++++++------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h index d3fadbb11c611..5f19e861e19f4 100644 --- a/libc/src/string/memory_utils/elements.h +++ b/libc/src/string/memory_utils/elements.h @@ -234,32 +234,47 @@ template struct HeadTail { // // Precondition: // - size >= T::kSize -template struct Loop { +template struct Loop { + static_assert(T::kSize == TailT::kSize, + "Tail type must have the same size as T"); + static void Copy(char *__restrict dst, const char *__restrict src, size_t size) { - for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + size_t offset = 0; + do { T::Copy(dst + offset, src + offset); - Tail::Copy(dst, src, size); + offset += T::kSize; + } while (offset < size - T::kSize); + Tail::Copy(dst, src, size); } static bool Equals(const char *lhs, const char *rhs, size_t size) { - for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + size_t offset = 0; + do { if (!T::Equals(lhs + offset, rhs + offset)) return false; - return Tail::Equals(lhs, rhs, size); + offset += T::kSize; + } while (offset < size - T::kSize); + return Tail::Equals(lhs, rhs, size); } static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { - for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + size_t offset = 0; + do { if (!T::Equals(lhs + offset, rhs + offset)) return T::ThreeWayCompare(lhs + offset, rhs + offset); - return Tail::ThreeWayCompare(lhs, rhs, size); + offset += T::kSize; + } while (offset < size - T::kSize); + return Tail::ThreeWayCompare(lhs, rhs, size); } static void SplatSet(char *dst, const unsigned char value, size_t size) { - for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + size_t offset = 0; + do { T::SplatSet(dst + offset, value); - Tail::SplatSet(dst, value, size); + offset += T::kSize; + } while 
(offset < size - T::kSize); + Tail::SplatSet(dst, value, size); } }; From ae0628f716cc05ad28adf963538a67e69d58d21d Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 16 Aug 2021 15:21:54 -0700 Subject: [PATCH 156/700] [sanitizer] Fix MAC build after D108163 --- compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc index e3b664f68b618..c78f944df8b74 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc @@ -186,7 +186,7 @@ size_t __sanitizer_mz_size(malloc_zone_t* zone, const void* ptr) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE -void *__sanitizer_mz_malloc(malloc_zone_t *zone, uptr size) { +void *__sanitizer_mz_malloc(malloc_zone_t *zone, size_t size) { COMMON_MALLOC_ENTER(); COMMON_MALLOC_MALLOC(size); return p; From 83457d398df15a2688bab72a9b658779e51efbe2 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 16 Aug 2021 22:28:53 +0000 Subject: [PATCH 157/700] [libc] dedup handling of size 4 for memset --- libc/src/string/memory_utils/memset_utils.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/libc/src/string/memory_utils/memset_utils.h b/libc/src/string/memory_utils/memset_utils.h index be1048a9be03b..5b955a3e30b1e 100644 --- a/libc/src/string/memory_utils/memset_utils.h +++ b/libc/src/string/memory_utils/memset_utils.h @@ -63,8 +63,6 @@ inline static void GeneralPurposeMemset(char *dst, unsigned char value, return SplatSet<_2>(dst, value); if (count == 3) return SplatSet<_3>(dst, value); - if (count == 4) - return SplatSet<_4>(dst, value); if (count <= 8) return SplatSet>(dst, value, count); if (count <= 16) From 913b5d2f7af71b20be332064787baa1ec3f570c0 Mon Sep 17 00:00:00 2001 From: Afanasyev Ivan Date: Mon, 16 Aug 2021 15:26:57 -0700 Subject: [PATCH 158/700] [AsmPrinter] fix 
nullptr dereference for MBBs with hasAddressTaken property without BB Basic block pointer is dereferenced unconditionally for MBBs with hasAddressTaken property. MBBs might have hasAddressTaken property without reference to BB. Backend developers must assign fake BB to MBB to workaround this issue and it should be fixed. Reviewed By: rnk Differential Revision: https://reviews.llvm.org/D108092 --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7171bfdd28e2d..4f48d007a1530 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3270,21 +3270,21 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { // reference the block. It is possible that there is more than one label // here, because multiple LLVM BB's may have been RAUW'd to this block after // the references were generated. + const BasicBlock *BB = MBB.getBasicBlock(); if (MBB.hasAddressTaken()) { - const BasicBlock *BB = MBB.getBasicBlock(); if (isVerbose()) OutStreamer->AddComment("Block address taken"); // MBBs can have their address taken as part of CodeGen without having // their corresponding BB's address taken in IR - if (BB->hasAddressTaken()) + if (BB && BB->hasAddressTaken()) for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB)) OutStreamer->emitLabel(Sym); } // Print some verbose block comments. if (isVerbose()) { - if (const BasicBlock *BB = MBB.getBasicBlock()) { + if (BB) { if (BB->hasName()) { BB->printAsOperand(OutStreamer->GetCommentOS(), /*PrintType=*/false, BB->getModule()); From eec3495a9d8060ebd0a90fb8b84f51ed24cf8c9d Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Sun, 15 Aug 2021 17:19:33 -0700 Subject: [PATCH 159/700] [M68k] Do not pass llvm::Function& to M68kCCState Previously we're passing `llvm::Function&` into `M68kCCState` to lower arguments in fastcc. 
However, that reference might not be available if it's a library call and we only need its argument types. Therefore, now we're simply passing a list of argument llvm::Type-s. This fixes PR-50752. Differential Revision: https://reviews.llvm.org/D108101 --- llvm/lib/Target/M68k/M68kCallingConv.h | 22 +++++++++++----------- llvm/lib/Target/M68k/M68kISelLowering.cpp | 13 ++++++++----- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kCallingConv.h b/llvm/lib/Target/M68k/M68kCallingConv.h index 18f72c95cedb2..20ffa993897f0 100644 --- a/llvm/lib/Target/M68k/M68kCallingConv.h +++ b/llvm/lib/Target/M68k/M68kCallingConv.h @@ -24,14 +24,13 @@ namespace llvm { /// Custom state to propagate llvm type info to register CC assigner -class M68kCCState : public CCState { -public: - const llvm::Function &F; +struct M68kCCState : public CCState { + ArrayRef ArgTypeList; - M68kCCState(const llvm::Function &F, CallingConv::ID CC, bool IsVarArg, + M68kCCState(ArrayRef ArgTypes, CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, SmallVectorImpl &Locs, LLVMContext &C) - : CCState(CC, IsVarArg, MF, Locs, C), F(F) {} + : CCState(CC, IsVarArg, MF, Locs, C), ArgTypeList(ArgTypes) {} }; /// NOTE this function is used to select registers for formal arguments and call @@ -39,7 +38,7 @@ class M68kCCState : public CCState { inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - M68kCCState CCInfo = static_cast(State); + const M68kCCState &CCInfo = static_cast(State); static const MCPhysReg DataRegList[] = {M68k::D0, M68k::D1, M68k::A0, M68k::A1}; @@ -52,14 +51,15 @@ inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, M68k::D1, }; - auto I = CCInfo.F.arg_begin(); + const auto &ArgTypes = CCInfo.ArgTypeList; + auto I = ArgTypes.begin(), End = ArgTypes.end(); int No = ValNo; - while (No > 0) { - No -= I->getType()->isIntegerTy(64) ? 
2 : 1; - I++; + while (No > 0 && I != End) { + No -= (*I)->isIntegerTy(64) ? 2 : 1; + ++I; } - bool IsPtr = I != CCInfo.F.arg_end() && I->getType()->isPointerTy(); + bool IsPtr = I != End && (*I)->isPointerTy(); unsigned Reg = IsPtr ? State.AllocateReg(AddrRegList) : State.AllocateReg(DataRegList); diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index 3e7cee9889d7c..f00083aaa1d89 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -519,9 +519,10 @@ SDValue M68kTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; - // It is empty for LibCall - const Function *CalleeFunc = CLI.CB ? CLI.CB->getCalledFunction() : nullptr; - M68kCCState CCInfo(*CalleeFunc, CallConv, IsVarArg, MF, ArgLocs, + SmallVector ArgTypes; + for (const auto &Arg : CLI.getArgs()) + ArgTypes.emplace_back(Arg.Ty); + M68kCCState CCInfo(ArgTypes, CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_M68k); @@ -876,8 +877,10 @@ SDValue M68kTargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. 
SmallVector ArgLocs; - M68kCCState CCInfo(MF.getFunction(), CCID, IsVarArg, MF, ArgLocs, - *DAG.getContext()); + SmallVector ArgTypes; + for (const Argument &Arg : MF.getFunction().args()) + ArgTypes.emplace_back(Arg.getType()); + M68kCCState CCInfo(ArgTypes, CCID, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CC_M68k); From 3a05af12b3a716e2cef3a8b6e51500e3b0f1d449 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 16 Aug 2021 15:42:17 -0700 Subject: [PATCH 160/700] Revert "[sanitizer] Fix MAC build after D108163" They still fail to fix Darwin builds https://green.lab.llvm.org/green/job/clang-stage1-RA/23399/consoleFull#462858634a1ca8a51-895e-46c6-af87-ce24fa4cd561 This reverts commit ae0628f716cc05ad28adf963538a67e69d58d21d. This reverts commit 2c6448cdc2f68f8c28fd0bd9404182b81306e6e6. --- .../sanitizer_common/sanitizer_internal_defs.h | 15 ++++----------- .../lib/sanitizer_common/sanitizer_malloc_mac.inc | 2 +- .../tests/sanitizer_bitvector_test.cpp | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index db26d9e213f22..056b00a10e2be 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,13 +139,8 @@ namespace __sanitizer { typedef unsigned long long uptr; typedef signed long long sptr; #else -# if (SANITIZER_WORDSIZE == 64) typedef unsigned long uptr; typedef signed long sptr; -# else -typedef unsigned int uptr; -typedef signed int sptr; -# endif #endif // defined(_WIN64) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use @@ -187,17 +182,15 @@ typedef uptr OFF_T; #endif typedef u64 OFF64_T; -#if (SANITIZER_WORDSIZE == 64) +#if (SANITIZER_WORDSIZE == 64) || SANITIZER_MAC typedef uptr operator_new_size_type; #else -# if defined(__s390__) && !defined(__s390x__) 
+# if defined(__s390__) && !defined(__s390x__) // Special case: 31-bit s390 has unsigned long as size_t. typedef unsigned long operator_new_size_type; -# elif SANITIZER_MAC -typedef unsigned long operator_new_size_type; -# else +# else typedef u32 operator_new_size_type; -# endif +# endif #endif typedef u64 tid_t; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc index c78f944df8b74..e3b664f68b618 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc @@ -186,7 +186,7 @@ size_t __sanitizer_mz_size(malloc_zone_t* zone, const void* ptr) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE -void *__sanitizer_mz_malloc(malloc_zone_t *zone, size_t size) { +void *__sanitizer_mz_malloc(malloc_zone_t *zone, uptr size) { COMMON_MALLOC_ENTER(); COMMON_MALLOC_MALLOC(size); return p; diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp index 385b6158300ca..670e96552c68f 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp @@ -71,7 +71,7 @@ void Print(const set &s) { #if defined(_WIN64) fprintf(stderr, "%llu ", *it); #else - fprintf(stderr, "%zu ", *it); + fprintf(stderr, "%lu ", *it); #endif } fprintf(stderr, "\n"); From 7256c05ecb76137f1e4ac255e128fb9c73d6b4a5 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Sat, 14 Aug 2021 16:51:10 -0700 Subject: [PATCH 161/700] [sanitizer] Define 32bit uptr as uint This makes it consistent with uintptr_t. It's 45138f788c9b3c4ac5d9ae4479841c411c15190e with Darwin fix. 
Reviewed By: kstoimenov Differential Revision: https://reviews.llvm.org/D108163 --- compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h | 5 +++++ .../lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index 056b00a10e2be..45165f6269513 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,8 +139,13 @@ namespace __sanitizer { typedef unsigned long long uptr; typedef signed long long sptr; #else +# if (SANITIZER_WORDSIZE == 64) || SANITIZER_MAC typedef unsigned long uptr; typedef signed long sptr; +# else +typedef unsigned int uptr; +typedef signed int sptr; +# endif #endif // defined(_WIN64) #if defined(__x86_64__) // Since x32 uses ILP32 data model in 64-bit hardware mode, we must use diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp index 670e96552c68f..385b6158300ca 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_bitvector_test.cpp @@ -71,7 +71,7 @@ void Print(const set &s) { #if defined(_WIN64) fprintf(stderr, "%llu ", *it); #else - fprintf(stderr, "%lu ", *it); + fprintf(stderr, "%zu ", *it); #endif } fprintf(stderr, "\n"); From 0d822da2bdda30a6a79971f9d160c82a4566f6a6 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 16 Aug 2021 16:07:47 -0700 Subject: [PATCH 162/700] [NFC] Remove/replace some confusing attribute getters on Function --- llvm/include/llvm/IR/Function.h | 12 ------------ .../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +- llvm/lib/IR/Function.cpp | 12 ------------ llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 4 ++-- 4 files changed, 3 insertions(+), 27 
deletions(-) diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index 4c3758ebdb090..f07e5770f01a2 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -440,11 +440,6 @@ class Function : public GlobalObject, public ilist_node { /// `undef` or `poison` value is passed from the list of attributes. void removeParamUndefImplyingAttrs(unsigned ArgNo); - /// check if an attributes is in the list of attributes. - bool hasAttribute(unsigned i, Attribute::AttrKind Kind) const { - return getAttributes().hasAttribute(i, Kind); - } - /// check if an attributes is in the list of attributes. bool hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const { return getAttributes().hasParamAttr(ArgNo, Kind); @@ -470,17 +465,10 @@ class Function : public GlobalObject, public ilist_node { return AttributeSets.getAttribute(i, Kind); } - /// adds the dereferenceable attribute to the list of attributes. - void addDereferenceableAttr(unsigned i, uint64_t Bytes); - /// adds the dereferenceable attribute to the list of attributes for /// the given arg. void addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes); - /// adds the dereferenceable_or_null attribute to the list of - /// attributes. - void addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes); - /// adds the dereferenceable_or_null attribute to the list of /// attributes for the given arg. 
void addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index aebd89e93329e..411cee34ae014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2668,7 +2668,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, TargetLowering::ArgListEntry Entry; Entry.Node = GuardVal; Entry.Ty = FnTy->getParamType(0); - if (GuardCheckFn->hasAttribute(1, Attribute::AttrKind::InReg)) + if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg)) Entry.IsInReg = true; Args.push_back(Entry); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 2755e5356c333..da846dd55c5f3 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -601,24 +601,12 @@ void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) { setAttributes(PAL); } -void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) { - AttributeList PAL = getAttributes(); - PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes); - setAttributes(PAL); -} - void Function::addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes) { AttributeList PAL = getAttributes(); PAL = PAL.addDereferenceableParamAttr(getContext(), ArgNo, Bytes); setAttributes(PAL); } -void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) { - AttributeList PAL = getAttributes(); - PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes); - setAttributes(PAL); -} - void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo, uint64_t Bytes) { AttributeList PAL = getAttributes(); diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index c42eee7aeddc1..f49abc61f746b 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ 
-96,7 +96,7 @@ static bool setDoesNotThrow(Function &F) { } static bool setRetDoesNotAlias(Function &F) { - if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias)) + if (F.hasRetAttribute(Attribute::NoAlias)) return false; F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); ++NumNoAlias; @@ -145,7 +145,7 @@ static bool setSignExtendedArg(Function &F, unsigned ArgNo) { static bool setRetNoUndef(Function &F) { if (!F.getReturnType()->isVoidTy() && - !F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef)) { + !F.hasRetAttribute(Attribute::NoUndef)) { F.addAttribute(AttributeList::ReturnIndex, Attribute::NoUndef); ++NumNoUndef; return true; From 797fe59e6b9512652c0ae5a6b69a3c6f5a573fcd Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 16 Aug 2021 16:15:57 -0700 Subject: [PATCH 163/700] [tsan] Fix GCC 8.3 build after D107911 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 8.3 reports: __tsan::v3::Event::type’ is too small to hold all values of ‘enum class __tsan::v3::EventType’ --- compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp | 6 +++--- compiler-rt/lib/tsan/rtl/tsan_trace.h | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp index 49e867a63aa92..db5070180442a 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_report.cpp @@ -472,7 +472,7 @@ void TraceReplay(Trace *trace, TracePart *last, Event *last_pos, Sid sid, for (Event *evp = &part->events[0]; evp < end; evp++) { Event *evp0 = evp; if (!evp->is_access && !evp->is_func) { - switch (evp->type) { + switch (evp->GetType()) { case EventType::kTime: { auto *ev = reinterpret_cast(evp); ev_sid = static_cast(ev->sid); @@ -573,7 +573,7 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, [&](Sid ev_sid, Epoch ev_epoch, Event *evp) { bool match = ev_sid == sid 
&& ev_epoch == epoch; if (evp->is_access) { - if (evp->is_func == 0 && evp->type == EventType::kAccessExt && + if (evp->is_func == 0 && evp->GetType() == EventType::kAccessExt && evp->_ == 0) // NopEvent return; auto *ev = reinterpret_cast(evp); @@ -602,7 +602,7 @@ bool RestoreStack(Tid tid, EventType type, Sid sid, Epoch epoch, uptr addr, } return; } - switch (evp->type) { + switch (evp->GetType()) { case EventType::kAccessExt: { auto *ev = reinterpret_cast(evp); uptr ev_addr = RestoreAddr(ev->addr); diff --git a/compiler-rt/lib/tsan/rtl/tsan_trace.h b/compiler-rt/lib/tsan/rtl/tsan_trace.h index a771ad9f52fd3..b48810aa82a1c 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_trace.h +++ b/compiler-rt/lib/tsan/rtl/tsan_trace.h @@ -87,13 +87,17 @@ struct Event { // Otherwise type denotes the type. u64 is_access : 1; u64 is_func : 1; - EventType type : 3; + u64 type : 3; u64 _ : 59; + + EventType GetType() const { + return static_cast(type); + } }; static_assert(sizeof(Event) == 8, "bad Event size"); // Nop event used as padding and does not affect state during replay. -static constexpr Event NopEvent = {1, 0, EventType::kAccessExt, 0}; +static constexpr Event NopEvent = {1, 0, static_cast(EventType::kAccessExt), 0}; // Compressed memory access can represent only some events with PCs // close enough to each other. Otherwise we fall back to EventAccessExt. From d8a08fae0af9aa09c108b3e7c60f192249dd2098 Mon Sep 17 00:00:00 2001 From: "Duncan P. N. Exon Smith" Date: Mon, 16 Aug 2021 16:18:11 -0700 Subject: [PATCH 164/700] Clean up test for -f{,no-}implicit-modules-uses-lock @arichardson pointed out in post-commit review for https://reviews.llvm.org/D95583 (b714f73defc8e075) that `-verify` has an optional argument that works a lot like `FileCheck`'s `-check-prefix`. Use it to simplify the test for `-fno-implicit-modules-use-lock`! 
--- clang/test/Modules/implicit-modules-use-lock.m | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/test/Modules/implicit-modules-use-lock.m b/clang/test/Modules/implicit-modules-use-lock.m index 811b83a7e4c76..6c9582df4de52 100644 --- a/clang/test/Modules/implicit-modules-use-lock.m +++ b/clang/test/Modules/implicit-modules-use-lock.m @@ -4,20 +4,20 @@ // // RUN: rm -rf %t.cache // RUN: %clang_cc1 -fmodules -fimplicit-module-maps \ -// RUN: -fimplicit-modules-use-lock -Rmodule-lock \ +// RUN: -fimplicit-modules-use-lock -Rmodule-lock -Rmodule-build \ // RUN: -fmodules-cache-path=%t.cache -I%S/Inputs/system-out-of-date \ // RUN: -fsyntax-only %s -Wnon-modular-include-in-framework-module \ -// RUN: -Werror=non-modular-include-in-framework-module 2>&1 \ -// RUN: | FileCheck %s -check-prefix=CHECK-LOCKS +// RUN: -Werror=non-modular-include-in-framework-module \ +// RUN: -verify=locks,build // // RUN: rm -rf %t.cache // RUN: %clang_cc1 -fmodules -fimplicit-module-maps \ -// RUN: -fno-implicit-modules-use-lock -Rmodule-lock \ +// RUN: -fno-implicit-modules-use-lock -Rmodule-lock -Rmodule-build \ // RUN: -fmodules-cache-path=%t.cache -I%S/Inputs/system-out-of-date \ // RUN: -fsyntax-only %s -Wnon-modular-include-in-framework-module \ -// RUN: -Werror=non-modular-include-in-framework-module 2>&1 \ -// RUN: | FileCheck %s -check-prefix=CHECK-NO-LOCKS -allow-empty +// RUN: -Werror=non-modular-include-in-framework-module \ +// RUN: -verify=build -// CHECK-NO-LOCKS-NOT: remark: -// CHECK-LOCKS: remark: locking '{{.*}}.pcm' to build module 'X' [-Rmodule-lock] -@import X; +@import X; // locks-remark-re {{locking '{{.*}}.pcm' to build module 'X'}} \ + // build-remark {{building module 'X'}} \ + // build-remark {{finished building module 'X'}} From a1e21864df68dd38a308ad6fae965dea52dcde0c Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Mon, 16 Aug 2021 09:21:13 -0700 Subject: [PATCH 165/700] [SamplePGO] Fixing a memory issue when creating 
profiles on-demand There is an on-demand creation of a function profile during top-down processing in the sample loader when merging uninlined callees. During the profile creation, a stack string object is used to store a newly-created MD5 name, which is then used by reference as hash key in the profile map. This makes the hash key a dangling reference when later on the stack string object is deallocated. The issue only happens with md5 profile use and was exposed by context split work for CS profile. I'm making a fix by storing newly created names in the reader. Reviewed By: wenlei, wmi, wlei Differential Revision: https://reviews.llvm.org/D108142 --- llvm/include/llvm/ProfileData/SampleProf.h | 4 ++-- llvm/include/llvm/ProfileData/SampleProfReader.h | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 2f71bbc6bbbe6..41abde767d888 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -104,10 +104,10 @@ static inline uint64_t SPMagic(SampleProfileFormat Format = SPF_Binary) { /// current Format uses MD5 to represent the string. static inline StringRef getRepInFormat(StringRef Name, bool UseMD5, std::string &GUIDBuf) { - if (Name.empty()) + if (Name.empty() || !UseMD5) return Name; GUIDBuf = std::to_string(Function::getGUID(Name)); - return UseMD5 ? 
StringRef(GUIDBuf) : Name; + return GUIDBuf; } static inline uint64_t SPVersion() { return 103; } diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index 2d5925bdb2b43..eae312477199a 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -245,6 +245,7 @@ #include #include #include +#include #include namespace llvm { @@ -407,6 +408,13 @@ class SampleProfileReader { std::string FGUID; StringRef CanonName = FunctionSamples::getCanonicalFnName(F); CanonName = getRepInFormat(CanonName, useMD5(), FGUID); + auto It = Profiles.find(CanonName); + if (It != Profiles.end()) + return &It->second; + if (!FGUID.empty()) { + assert(useMD5() && "New name should only be generated for md5 profile"); + CanonName = *MD5NameBuffer.insert(FGUID).first; + } return &Profiles[CanonName]; } @@ -503,6 +511,10 @@ class SampleProfileReader { /// Memory buffer holding the profile file. std::unique_ptr Buffer; + /// Extra name buffer holding names created on demand. + /// This should only be needed for md5 profiles. + std::unordered_set MD5NameBuffer; + /// Profile summary information. 
std::unique_ptr Summary; From 5a95ff2bfcc191f3ffecb69ac1d762a93b149eee Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 16 Aug 2021 16:41:17 -0700 Subject: [PATCH 166/700] [lldb] Fix -Wunused-but-set-variable --- lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 79e6b3f609651..f2ed6330e36d1 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1584,6 +1584,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, } } assert(tag_decl_kind != -1); + (void)tag_decl_kind; bool clang_type_was_created = false; clang_type.SetCompilerType( &m_ast, dwarf->GetForwardDeclDieToClangType().lookup(die.GetDIE())); From fef39cc472a773fae4761deaab1c701024ad13ec Mon Sep 17 00:00:00 2001 From: Douglas Yung Date: Mon, 16 Aug 2021 17:01:57 -0700 Subject: [PATCH 167/700] [tsan] Another attempt to fix GCC 8.3 build after D107911 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the -Werror compilation flag for x64 linux to work around a gcc bug. GCC 8.3 reports '__tsan::v3::Event::type’ is too small to hold all values of ‘enum class __tsan::v3::EventType’ incorrectly which gets promoted to an error and causes the build to fail. 
--- compiler-rt/lib/tsan/go/buildgo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/tsan/go/buildgo.sh b/compiler-rt/lib/tsan/go/buildgo.sh index 661a2e21c2f32..489da16066dd4 100755 --- a/compiler-rt/lib/tsan/go/buildgo.sh +++ b/compiler-rt/lib/tsan/go/buildgo.sh @@ -59,7 +59,7 @@ if [ "`uname -a | grep Linux`" != "" ]; then elif [ "`uname -a | grep x86_64`" != "" ]; then SUFFIX="linux_amd64" ARCHCFLAGS="-m64 -msse4.2" - OSCFLAGS="$OSCFLAGS -ffreestanding -Wno-unused-const-variable -Werror -Wno-unknown-warning-option" + OSCFLAGS="$OSCFLAGS -ffreestanding -Wno-unused-const-variable -Wno-unknown-warning-option" elif [ "`uname -a | grep aarch64`" != "" ]; then SUFFIX="linux_arm64" ARCHCFLAGS="" From e2c97d4484468a498def2527136edeb5abc332d9 Mon Sep 17 00:00:00 2001 From: Geoffrey Martin-Noble Date: Thu, 5 Aug 2021 16:29:59 -0700 Subject: [PATCH 168/700] [MLIR] Add a bitcast method to DenseElementsAttr This method bitcasts a DenseElementsAttr elementwise to one of the same shape with a different element type. Reviewed By: rriddle Differential Revision: https://reviews.llvm.org/D107612 --- mlir/include/mlir/IR/BuiltinAttributes.h | 5 +++++ mlir/lib/Dialect/StandardOps/IR/Ops.cpp | 12 ++---------- mlir/lib/IR/BuiltinAttributes.cpp | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/IR/BuiltinAttributes.h b/mlir/include/mlir/IR/BuiltinAttributes.h index 28ced2deb86f0..c71c5d4bbd4ae 100644 --- a/mlir/include/mlir/IR/BuiltinAttributes.h +++ b/mlir/include/mlir/IR/BuiltinAttributes.h @@ -560,6 +560,11 @@ class DenseElementsAttr : public ElementsAttr { /// same total number of elements as well as element type. DenseElementsAttr reshape(ShapedType newType); + /// Return a new DenseElementsAttr that has the same data as the current + /// attribute, but has bitcast elements to 'newElType'. The new type must have + /// the same bitwidth as the current element type. 
+ DenseElementsAttr bitcast(Type newElType); + /// Generates a new DenseElementsAttr by mapping each int value to a new /// underlying APInt. The new values can represent either an integer or float. /// This underlying type must be an DenseIntElementsAttr. diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index c0d18193b2608..0c4b1441dd4cd 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -512,16 +512,8 @@ OpFoldResult BitcastOp::fold(ArrayRef operands) { Type resType = getResult().getType(); - if (auto denseAttr = operand.dyn_cast()) { - Type elType = getElementTypeOrSelf(resType); - return denseAttr.mapValues( - elType, [](const APFloat &f) { return f.bitcastToAPInt(); }); - } - if (auto denseAttr = operand.dyn_cast()) { - Type elType = getElementTypeOrSelf(resType); - // mapValues does its own bitcast to the target type. - return denseAttr.mapValues(elType, [](const APInt &i) { return i; }); - } + if (auto denseAttr = operand.dyn_cast()) + return denseAttr.bitcast(resType.cast().getElementType()); APInt bits; if (auto floatAttr = operand.dyn_cast()) diff --git a/mlir/lib/IR/BuiltinAttributes.cpp b/mlir/lib/IR/BuiltinAttributes.cpp index 85914be243b92..5a754dfe7c3f5 100644 --- a/mlir/lib/IR/BuiltinAttributes.cpp +++ b/mlir/lib/IR/BuiltinAttributes.cpp @@ -1024,7 +1024,6 @@ DenseElementsAttr DenseElementsAttr::reshape(ShapedType newType) { if (curType == newType) return *this; - (void)curType; assert(newType.getElementType() == curType.getElementType() && "expected the same element type"); assert(newType.getNumElements() == curType.getNumElements() && @@ -1032,6 +1031,23 @@ DenseElementsAttr DenseElementsAttr::reshape(ShapedType newType) { return DenseIntOrFPElementsAttr::getRaw(newType, getRawData(), isSplat()); } +/// Return a new DenseElementsAttr that has the same data as the current +/// attribute, but has bitcast elements such that it is now 'newType'. 
The new +/// type must have the same shape and element types of the same bitwidth as the +/// current type. +DenseElementsAttr DenseElementsAttr::bitcast(Type newElType) { + ShapedType curType = getType(); + Type curElType = curType.getElementType(); + if (curElType == newElType) + return *this; + + assert(getDenseElementBitWidth(newElType) == + getDenseElementBitWidth(curElType) && + "expected element types with the same bitwidth"); + return DenseIntOrFPElementsAttr::getRaw(curType.clone(newElType), + getRawData(), isSplat()); +} + DenseElementsAttr DenseElementsAttr::mapValues(Type newElementType, function_ref mapping) const { From 9ed4a94d6451046a51ef393cd62f00710820a7e8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 16 Aug 2021 17:11:07 -0700 Subject: [PATCH 169/700] [clang] Expose unreachable fallthrough annotation warning The Linux kernel has a macro called IS_ENABLED(), which evaluates to a constant 1 or 0 based on Kconfig selections, allowing C code to be unconditionally enabled or disabled at build time. For example: int foo(struct *a, int b) { switch (b) { case 1: if (a->flag || !IS_ENABLED(CONFIG_64BIT)) return 1; __attribute__((fallthrough)); case 2: return 2; default: return 3; } } There is an unreachable warning about the fallthrough annotation in the first case because !IS_ENABLED(CONFIG_64BIT) can be evaluated to 1, which looks like return 1; __attribute__((fallthrough)); to clang. This type of warning is pointless for the Linux kernel because it does this trick all over the place due to the sheer number of configuration options that it has. Add -Wunreachable-code-fallthrough, enabled under -Wunreachable-code, so that projects that want to warn on unreachable code get this warning but projects that do not care about unreachable code can still use -Wimplicit-fallthrough without having to make changes to their code base. Fixes PR51094. 
Reviewed By: aaron.ballman, nickdesaulniers Differential Revision: https://reviews.llvm.org/D107933 --- clang/include/clang/Basic/DiagnosticGroups.td | 4 +++- .../clang/Basic/DiagnosticSemaKinds.td | 6 ++--- clang/lib/Sema/AnalysisBasedWarnings.cpp | 2 +- clang/test/SemaCXX/P30636.cpp | 2 +- .../SemaCXX/switch-implicit-fallthrough.cpp | 22 ++++++++++++++++++- 5 files changed, 29 insertions(+), 7 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 30dadd9731c15..17b5f419ef58c 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -821,8 +821,10 @@ def ReservedIdentifier : DiagGroup<"reserved-identifier", // under separate flags. // def UnreachableCodeLoopIncrement : DiagGroup<"unreachable-code-loop-increment">; +def UnreachableCodeFallthrough : DiagGroup<"unreachable-code-fallthrough">; def UnreachableCode : DiagGroup<"unreachable-code", - [UnreachableCodeLoopIncrement]>; + [UnreachableCodeLoopIncrement, + UnreachableCodeFallthrough]>; def UnreachableCodeBreak : DiagGroup<"unreachable-code-break">; def UnreachableCodeReturn : DiagGroup<"unreachable-code-return">; def UnreachableCodeAggressive : DiagGroup<"unreachable-code-aggressive", diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 9eaa696d99913..41152212c0d12 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -682,6 +682,9 @@ def warn_unreachable_return : Warning< def warn_unreachable_loop_increment : Warning< "loop will run at most once (loop increment never executed)">, InGroup, DefaultIgnore; +def warn_unreachable_fallthrough_attr : Warning< + "fallthrough annotation in unreachable code">, + InGroup, DefaultIgnore; def note_unreachable_silence : Note< "silence by adding parentheses to mark code as explicitly dead">; @@ -9578,9 +9581,6 @@ def 
err_fallthrough_attr_outside_switch : Error< "fallthrough annotation is outside switch statement">; def err_fallthrough_attr_invalid_placement : Error< "fallthrough annotation does not directly precede switch label">; -def warn_fallthrough_attr_unreachable : Warning< - "fallthrough annotation in unreachable code">, - InGroup, DefaultIgnore; def warn_unreachable_default : Warning< "default label in switch which covers all enumeration values">, diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index aa2602c8d9256..99ce143d3559d 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -1125,7 +1125,7 @@ namespace { // unreachable in all instantiations of the template. if (!IsTemplateInstantiation) S.Diag(AS->getBeginLoc(), - diag::warn_fallthrough_attr_unreachable); + diag::warn_unreachable_fallthrough_attr); markFallthroughVisited(AS); ++AnnotatedCnt; break; diff --git a/clang/test/SemaCXX/P30636.cpp b/clang/test/SemaCXX/P30636.cpp index 2e2affb0cfdea..1d5400d3ba0ed 100644 --- a/clang/test/SemaCXX/P30636.cpp +++ b/clang/test/SemaCXX/P30636.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -Wunreachable-code-fallthrough %s // expected-no-diagnostics template diff --git a/clang/test/SemaCXX/switch-implicit-fallthrough.cpp b/clang/test/SemaCXX/switch-implicit-fallthrough.cpp index 9676664a7a30a..0b790813506c3 100644 --- a/clang/test/SemaCXX/switch-implicit-fallthrough.cpp +++ b/clang/test/SemaCXX/switch-implicit-fallthrough.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -Wunreachable-code-fallthrough %s int fallthrough(int n) { @@ -193,6 +193,26 @@ int fallthrough_position(int n) { ; } +#pragma clang diagnostic push +#pragma 
clang diagnostic ignored "-Wunreachable-code-fallthrough" + switch (n) { + n += 300; + [[clang::fallthrough]]; // no warning here + case 221: + return 1; + [[clang::fallthrough]]; // no warning here + case 222: + return 2; + __attribute__((fallthrough)); // no warning here + case 223: + if (1) + return 3; + __attribute__((fallthrough)); // no warning here + case 224: + n += 400; + } +#pragma clang diagnostic pop + long p = static_cast(n) * n; switch (sizeof(p)) { case 9: From f27fee623d0124c64a73374d0256819396647864 Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Mon, 16 Aug 2021 14:17:43 -0700 Subject: [PATCH 170/700] [SamplePGO][NFC] Dump function profiles in order Sample profiles are stored in a string map which is basically an unordered map. Printing out profiles by simply walking the string map doesn't enforce an order. I'm sorting the map in the decreasing order of total samples to enable a more stable dump, which is good for comparing two dumps. Reviewed By: wenlei, wlei Differential Revision: https://reviews.llvm.org/D108147 --- llvm/include/llvm/ProfileData/SampleProf.h | 5 +++++ llvm/lib/ProfileData/SampleProf.cpp | 17 +++++++++++++++++ llvm/lib/ProfileData/SampleProfReader.cpp | 6 ++++-- llvm/lib/ProfileData/SampleProfWriter.cpp | 15 +-------------- .../Inputs/profile-symbol-list.expected | 8 ++++---- 5 files changed, 31 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 41abde767d888..0cf1e491682b9 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -961,6 +961,11 @@ class FunctionSamples { raw_ostream &operator<<(raw_ostream &OS, const FunctionSamples &FS); +using NameFunctionSamples = std::pair; + +void sortFuncProfiles(const StringMap &ProfileMap, + std::vector &SortedProfiles); + /// Sort a LocationT->SampleT map by LocationT. 
/// /// It produces a sorted list of records by ascending diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index 60e707b146d5e..adbec7aef0e01 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -198,6 +198,23 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS, return OS; } +void sampleprof::sortFuncProfiles( + const StringMap &ProfileMap, + std::vector &SortedProfiles) { + for (const auto &I : ProfileMap) { + assert(I.getKey() == I.second.getNameWithContext() && + "Inconsistent profile map"); + SortedProfiles.push_back( + std::make_pair(I.second.getNameWithContext(), &I.second)); + } + llvm::stable_sort(SortedProfiles, [](const NameFunctionSamples &A, + const NameFunctionSamples &B) { + if (A.second->getTotalSamples() == B.second->getTotalSamples()) + return A.first > B.first; + return A.second->getTotalSamples() > B.second->getTotalSamples(); + }); +} + unsigned FunctionSamples::getOffset(const DILocation *DIL) { return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) & 0xffff; diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index a801ca1ef36d7..7fc95520951fb 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -66,8 +66,10 @@ void SampleProfileReader::dumpFunctionProfile(StringRef FName, /// Dump all the function profiles found on stream \p OS. void SampleProfileReader::dump(raw_ostream &OS) { - for (const auto &I : Profiles) - dumpFunctionProfile(I.getKey(), OS); + std::vector V; + sortFuncProfiles(Profiles, V); + for (const auto &I : V) + dumpFunctionProfile(I.first, OS); } /// Parse \p Input as function head. 
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 535f879681048..3b29395bc2c92 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -43,21 +43,8 @@ using namespace sampleprof; std::error_code SampleProfileWriter::writeFuncProfiles( const StringMap &ProfileMap) { - // Sort the ProfileMap by total samples. - typedef std::pair NameFunctionSamples; std::vector V; - for (const auto &I : ProfileMap) { - assert(I.getKey() == I.second.getNameWithContext() && - "Inconsistent profile map"); - V.push_back(std::make_pair(I.second.getNameWithContext(), &I.second)); - } - llvm::stable_sort( - V, [](const NameFunctionSamples &A, const NameFunctionSamples &B) { - if (A.second->getTotalSamples() == B.second->getTotalSamples()) - return A.first > B.first; - return A.second->getTotalSamples() > B.second->getTotalSamples(); - }); - + sortFuncProfiles(ProfileMap, V); for (const auto &I : V) { if (std::error_code EC = writeSample(*I.second)) return EC; diff --git a/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list.expected b/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list.expected index d9af0de37d276..bd528b44b81c4 100644 --- a/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list.expected +++ b/llvm/test/tools/llvm-profdata/Inputs/profile-symbol-list.expected @@ -20,14 +20,14 @@ Samples collected in inlined callsites { } No inlined callsites in this function } -Function: _Z3fooi: 15422, 1220, 1 sampled lines +Function: _Z3bari: 40602, 2874, 1 sampled lines Samples collected in the function's body { - 1: 1220 + 1: 2874 } No inlined callsites in this function -Function: _Z3bari: 40602, 2874, 1 sampled lines +Function: _Z3fooi: 15422, 1220, 1 sampled lines Samples collected in the function's body { - 1: 2874 + 1: 1220 } No inlined callsites in this function ======== Dump profile symbol list ======== From 4c4ab673f10f558fa55ec97ece95ddfe109b2212 Mon Sep 17 00:00:00 
2001 From: Matthias Springer Date: Tue, 17 Aug 2021 10:08:08 +0900 Subject: [PATCH 171/700] [mlir][Analysis][NFC] Split FlatAffineConstraints class * Extract "value" functionality of `FlatAffineConstraints` into a new derived `FlatAffineValueConstraints` class. Current users of `FlatAffineConstraints` can use `FlatAffineValueConstraints` without additional code changes, thus NFC. * `FlatAffineConstraints` no longer associates dimensions with SSA Values. All functionality that requires this, is moved to `FlatAffineValueConstraints`. * `FlatAffineConstraints` no longer makes assumptions about where Values associated with dimensions are coming from. Differential Revision: https://reviews.llvm.org/D107725 --- mlir/include/mlir/Analysis/AffineAnalysis.h | 6 +- mlir/include/mlir/Analysis/AffineStructures.h | 604 +++++++++++------- mlir/include/mlir/Analysis/Utils.h | 27 +- .../Dialect/Linalg/Analysis/ConstraintsSet.h | 6 +- mlir/lib/Analysis/AffineAnalysis.cpp | 78 +-- mlir/lib/Analysis/AffineStructures.cpp | 270 +++++--- mlir/lib/Analysis/Utils.cpp | 28 +- .../Transforms/AffineScalarReplacement.cpp | 2 +- mlir/lib/Transforms/LoopFusion.cpp | 2 +- mlir/lib/Transforms/Utils/LoopFusionUtils.cpp | 2 +- mlir/lib/Transforms/Utils/LoopUtils.cpp | 10 +- .../Analysis/TestMemRefDependenceCheck.cpp | 2 +- 12 files changed, 635 insertions(+), 402 deletions(-) diff --git a/mlir/include/mlir/Analysis/AffineAnalysis.h b/mlir/include/mlir/Analysis/AffineAnalysis.h index a0fae59939052..849d22e6938fb 100644 --- a/mlir/include/mlir/Analysis/AffineAnalysis.h +++ b/mlir/include/mlir/Analysis/AffineAnalysis.h @@ -25,7 +25,7 @@ namespace mlir { class AffineApplyOp; class AffineForOp; class AffineValueMap; -class FlatAffineConstraints; +class FlatAffineValueConstraints; class Operation; /// A description of a (parallelizable) reduction in an affine loop. @@ -67,7 +67,7 @@ void getReachableAffineApplyOps(ArrayRef operands, /// AffineIfOp. // TODO: handle non-unit strides. 
LogicalResult getIndexSet(MutableArrayRef ops, - FlatAffineConstraints *domain); + FlatAffineValueConstraints *domain); /// Encapsulates a memref load or store access information. struct MemRefAccess { @@ -136,7 +136,7 @@ struct DependenceResult { DependenceResult checkMemrefAccessDependence( const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, - unsigned loopDepth, FlatAffineConstraints *dependenceConstraints, + unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints, SmallVector *dependenceComponents, bool allowRAR = false); diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h index e2f4c10e1078e..c97a2f6493eed 100644 --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -58,38 +58,34 @@ struct MutableAffineMap; /// class FlatAffineConstraints { public: + /// All derived classes of FlatAffineConstraints. + enum class Kind { FlatAffineConstraints, FlatAffineValueConstraints }; + + /// Kind of identifier (column). enum IdKind { Dimension, Symbol, Local }; /// Constructs a constraint system reserving memory for the specified number - /// of constraints and identifiers.. + /// of constraints and identifiers. 
FlatAffineConstraints(unsigned numReservedInequalities, unsigned numReservedEqualities, unsigned numReservedCols, unsigned numDims, - unsigned numSymbols, unsigned numLocals, - ArrayRef> idArgs = {}) + unsigned numSymbols, unsigned numLocals) : numIds(numDims + numSymbols + numLocals), numDims(numDims), numSymbols(numSymbols), equalities(0, numIds + 1, numReservedEqualities, numReservedCols), inequalities(0, numIds + 1, numReservedInequalities, numReservedCols) { assert(numReservedCols >= numIds + 1); - assert(idArgs.empty() || idArgs.size() == numIds); - ids.reserve(numReservedCols); - if (idArgs.empty()) - ids.resize(numIds, None); - else - ids.append(idArgs.begin(), idArgs.end()); } /// Constructs a constraint system with the specified number of /// dimensions and symbols. FlatAffineConstraints(unsigned numDims = 0, unsigned numSymbols = 0, - unsigned numLocals = 0, - ArrayRef> idArgs = {}) + unsigned numLocals = 0) : FlatAffineConstraints(/*numReservedInequalities=*/0, /*numReservedEqualities=*/0, /*numReservedCols=*/numDims + numSymbols + numLocals + 1, - numDims, numSymbols, numLocals, idArgs) {} + numDims, numSymbols, numLocals) {} /// Return a system with no constraints, i.e., one which is satisfied by all /// points. @@ -98,28 +94,27 @@ class FlatAffineConstraints { return FlatAffineConstraints(numDims, numSymbols); } - /// Create a flat affine constraint system from an AffineValueMap or a list of - /// these. The constructed system will only include equalities. - explicit FlatAffineConstraints(const AffineValueMap &avm); - explicit FlatAffineConstraints(ArrayRef avmRef); - /// Creates an affine constraint system from an IntegerSet. explicit FlatAffineConstraints(IntegerSet set); - FlatAffineConstraints(ArrayRef avmRef, - IntegerSet set); - FlatAffineConstraints(const MutableAffineMap &map); - ~FlatAffineConstraints() {} + virtual ~FlatAffineConstraints() = default; - // Clears any existing data and reserves memory for the specified constraints. 
- void reset(unsigned numReservedInequalities, unsigned numReservedEqualities, - unsigned numReservedCols, unsigned numDims, unsigned numSymbols, - unsigned numLocals = 0, ArrayRef idArgs = {}); + /// Return the kind of this FlatAffineConstraints. + virtual Kind getKind() const { return Kind::FlatAffineConstraints; } + + static bool classof(const FlatAffineConstraints *cst) { return true; } + + /// Clears any existing data and reserves memory for the specified + /// constraints. + virtual void reset(unsigned numReservedInequalities, + unsigned numReservedEqualities, unsigned numReservedCols, + unsigned numDims, unsigned numSymbols, + unsigned numLocals = 0); void reset(unsigned numDims = 0, unsigned numSymbols = 0, - unsigned numLocals = 0, ArrayRef idArgs = {}); + unsigned numLocals = 0); /// Appends constraints from 'other' into this. This is equivalent to an /// intersection with no simplification of any sort attempted. @@ -142,10 +137,10 @@ class FlatAffineConstraints { /// false if a solution exists. Uses the same algorithm as findIntegerSample. bool isIntegerEmpty() const; - // Returns a matrix where each row is a vector along which the polytope is - // bounded. The span of the returned vectors is guaranteed to contain all - // such vectors. The returned vectors are NOT guaranteed to be linearly - // independent. This function should not be called on empty sets. + /// Returns a matrix where each row is a vector along which the polytope is + /// bounded. The span of the returned vectors is guaranteed to contain all + /// such vectors. The returned vectors are NOT guaranteed to be linearly + /// independent. This function should not be called on empty sets. Matrix getBoundedDirections() const; /// Find an integer sample point satisfying the constraints using a @@ -160,7 +155,7 @@ class FlatAffineConstraints { /// otherwise. bool containsPoint(ArrayRef point) const; - // Clones this object. + /// Clones this object. 
std::unique_ptr clone() const; /// Returns the value at the specified equality row and column. @@ -198,42 +193,6 @@ class FlatAffineConstraints { return inequalities.getRow(idx); } - /// Adds constraints (lower and upper bounds) for the specified 'affine.for' - /// operation's Value using IR information stored in its bound maps. The - /// right identifier is first looked up using forOp's Value. Asserts if the - /// Value corresponding to the 'affine.for' operation isn't found in the - /// constraint system. Returns failure for the yet unimplemented/unsupported - /// cases. Any new identifiers that are found in the bound operands of the - /// 'affine.for' operation are added as trailing identifiers (either - /// dimensional or symbolic depending on whether the operand is a valid - /// symbol). - // TODO: add support for non-unit strides. - LogicalResult addAffineForOpDomain(AffineForOp forOp); - - /// Adds constraints (lower and upper bounds) for each loop in the loop nest - /// described by the bound maps 'lbMaps' and 'ubMaps' of a computation slice. - /// Every pair ('lbMaps[i]', 'ubMaps[i]') describes the bounds of a loop in - /// the nest, sorted outer-to-inner. 'operands' contains the bound operands - /// for a single bound map. All the bound maps will use the same bound - /// operands. Note that some loops described by a computation slice might not - /// exist yet in the IR so the Value attached to those dimension identifiers - /// might be empty. For that reason, this method doesn't perform Value - /// look-ups to retrieve the dimension identifier positions. Instead, it - /// assumes the position of the dim identifiers in the constraint system is - /// the same as the position of the loop in the loop nest. - LogicalResult addDomainFromSliceMaps(ArrayRef lbMaps, - ArrayRef ubMaps, - ArrayRef operands); - - /// Adds constraints imposed by the `affine.if` operation. 
These constraints - /// are collected from the IntegerSet attached to the given `affine.if` - /// instance argument (`ifOp`). It is asserted that: - /// 1) The IntegerSet of the given `affine.if` instance should not contain - /// semi-affine expressions, - /// 2) The columns of the constraint system created from `ifOp` should match - /// the columns in the current one regarding numbers and values. - void addAffineIfOpDomain(AffineIfOp ifOp); - /// Adds a lower or an upper bound for the identifier at the specified /// position with constraints being drawn from the specified bound map. If /// `eq` is true, add a single equality equal to the bound map's first result @@ -243,23 +202,6 @@ class FlatAffineConstraints { LogicalResult addLowerOrUpperBound(unsigned pos, AffineMap boundMap, bool eq, bool lower = true); - /// Adds a lower or an upper bound for the identifier at the specified - /// position with constraints being drawn from the specified bound map and - /// operands. If `eq` is true, add a single equality equal to the bound map's - /// first result expr. - LogicalResult addLowerOrUpperBound(unsigned pos, AffineMap boundMap, - ValueRange operands, bool eq, - bool lower = true); - - /// Returns the bound for the identifier at `pos` from the inequality at - /// `ineqPos` as a 1-d affine value map (affine map + operands). The returned - /// affine value map can either be a lower bound or an upper bound depending - /// on the sign of atIneq(ineqPos, pos). Asserts if the row at `ineqPos` does - /// not involve the `pos`th identifier. - void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, - AffineValueMap &vmap, - MLIRContext *context) const; - /// Returns the constraint system as an integer set. 
Returns a null integer /// set if the system has no constraints, or if an integer set couldn't be /// constructed as a result of a local variable's explicit representation not @@ -276,20 +218,9 @@ class FlatAffineConstraints { SmallVectorImpl *lbMaps, SmallVectorImpl *ubMaps); - /// Adds slice lower bounds represented by lower bounds in 'lbMaps' and upper - /// bounds in 'ubMaps' to each identifier in the constraint system which has - /// a value in 'values'. Note that both lower/upper bounds share the same - /// operand list 'operands'. - /// This function assumes 'values.size' == 'lbMaps.size' == 'ubMaps.size'. - /// Note that both lower/upper bounds use operands from 'operands'. - LogicalResult addSliceBounds(ArrayRef values, - ArrayRef lbMaps, - ArrayRef ubMaps, - ArrayRef operands); - - // Adds an inequality (>= 0) from the coefficients specified in inEq. + /// Adds an inequality (>= 0) from the coefficients specified in inEq. void addInequality(ArrayRef inEq); - // Adds an equality from the coefficients specified in eq. + /// Adds an equality from the coefficients specified in eq. void addEquality(ArrayRef eq); /// Adds a constant lower bound constraint for the specified identifier. @@ -312,48 +243,16 @@ class FlatAffineConstraints { /// Sets the identifier at the specified position to a constant. void setIdToConstant(unsigned pos, int64_t val); - /// Sets the identifier corresponding to the specified Value id to a - /// constant. Asserts if the 'id' is not found. - void setIdToConstant(Value id, int64_t val); - - /// Looks up the position of the identifier with the specified Value. Returns - /// true if found (false otherwise). `pos' is set to the (column) position of - /// the identifier. - bool findId(Value id, unsigned *pos) const; - - /// Returns true if an identifier with the specified Value exists, false - /// otherwise. - bool containsId(Value id) const; - /// Swap the posA^th identifier with the posB^th identifier. 
- void swapId(unsigned posA, unsigned posB); - - // Add identifiers of the specified kind - specified positions are relative to - // the kind of identifier. The coefficient column corresponding to the added - // identifier is initialized to zero. 'id' is the Value corresponding to the - // identifier that can optionally be provided. - void addDimId(unsigned pos, Value id = nullptr); - void addSymbolId(unsigned pos, Value id = nullptr); - void addLocalId(unsigned pos); - void addId(IdKind kind, unsigned pos, Value id = nullptr); + virtual void swapId(unsigned posA, unsigned posB); - /// Add the specified values as a dim or symbol id depending on its nature, if - /// it already doesn't exist in the system. `id' has to be either a terminal - /// symbol or a loop IV, i.e., it cannot be the result affine.apply of any - /// symbols or loop IVs. The identifier is added to the end of the existing - /// dims or symbols. Additional information on the identifier is extracted - /// from the IR and added to the constraint system. - void addInductionVarOrTerminalSymbol(Value id); - - /// Composes the affine value map with this FlatAffineConstrains, adding the - /// results of the map as dimensions at the front [0, vMap->getNumResults()) - /// and with the dimensions set to the equalities specified by the value map. - /// Returns failure if the composition fails (when vMap is a semi-affine map). - /// The vMap's operand Value's are used to look up the right positions in - /// the FlatAffineConstraints with which to associate. Every operand of vMap - /// should have a matching dim/symbol column in this constraint system (with - /// the same associated Value). - LogicalResult composeMap(const AffineValueMap *vMap); + /// Add identifiers of the specified kind - specified positions are relative + /// to the kind of identifier. The coefficient column corresponding to the + /// added identifier is initialized to zero. 
+ void addDimId(unsigned pos); + void addSymbolId(unsigned pos); + void addLocalId(unsigned pos); + virtual unsigned addId(IdKind kind, unsigned pos); /// Composes an affine map whose dimensions and symbols match one to one with /// the dimensions and symbols of this FlatAffineConstraints. The results of @@ -369,27 +268,21 @@ class FlatAffineConstraints { void projectOut(unsigned pos, unsigned num); inline void projectOut(unsigned pos) { return projectOut(pos, 1); } - /// Projects out the identifier that is associate with Value . - void projectOut(Value id); - /// Removes the specified identifier from the system. void removeId(unsigned pos); void removeEquality(unsigned pos); void removeInequality(unsigned pos); + /// Sets the values.size() identifiers starting at pos to the specified values + /// and removes them. + void setAndEliminate(unsigned pos, ArrayRef values); + /// Changes the partition between dimensions and symbols. Depending on the new /// symbol count, either a chunk of trailing dimensional identifiers becomes /// symbols, or some of the leading symbols become dimensions. void setDimSymbolSeparation(unsigned newSymbolCount); - /// Changes all symbol identifiers which are loop IVs to dim identifiers. - void convertLoopIVSymbolsToDims(); - - /// Sets the values.size() identifiers starting at pos to the specified values - /// and removes them. - void setAndEliminate(unsigned pos, ArrayRef values); - /// Tries to fold the specified identifier to a constant using a trivial /// equality detection; if successful, the constant is substituted for the /// identifier everywhere in the constraint system and then removed from the @@ -415,25 +308,6 @@ class FlatAffineConstraints { /// <= 15}, output = {0 <= d0 <= 6, 1 <= d1 <= 15}. 
LogicalResult unionBoundingBox(const FlatAffineConstraints &other); - /// Returns 'true' if this constraint system and 'other' are in the same - /// space, i.e., if they are associated with the same set of identifiers, - /// appearing in the same order. Returns 'false' otherwise. - bool areIdsAlignedWithOther(const FlatAffineConstraints &other); - - /// Merge and align the identifiers of 'this' and 'other' starting at - /// 'offset', so that both constraint systems get the union of the contained - /// identifiers that is dimension-wise and symbol-wise unique; both - /// constraint systems are updated so that they have the union of all - /// identifiers, with this's original identifiers appearing first followed by - /// any of other's identifiers that didn't appear in 'this'. Local - /// identifiers of each system are by design separate/local and are placed - /// one after other (this's followed by other's). - // Eg: Input: 'this' has ((%i %j) [%M %N]) - // 'other' has (%k, %j) [%P, %N, %M]) - // Output: both 'this', 'other' have (%i, %j, %k) [%M, %N, %P] - // - void mergeAndAlignIdsWithOther(unsigned offset, FlatAffineConstraints *other); - unsigned getNumConstraints() const { return getNumInequalities() + getNumEqualities(); } @@ -445,56 +319,8 @@ class FlatAffineConstraints { return numIds - numDims - numSymbols; } - inline ArrayRef> getIds() const { - return {ids.data(), ids.size()}; - } - inline MutableArrayRef> getIds() { - return {ids.data(), ids.size()}; - } - - /// Returns the optional Value corresponding to the pos^th identifier. - inline Optional getId(unsigned pos) const { return ids[pos]; } - inline Optional &getId(unsigned pos) { return ids[pos]; } - - /// Returns the Value associated with the pos^th identifier. Asserts if - /// no Value identifier was associated. 
- inline Value getIdValue(unsigned pos) const { - assert(ids[pos].hasValue() && "identifier's Value not set"); - return ids[pos].getValue(); - } - - /// Returns the Values associated with identifiers in range [start, end). - /// Asserts if no Value was associated with one of these identifiers. - void getIdValues(unsigned start, unsigned end, - SmallVectorImpl *values) const { - assert((start < numIds || start == end) && "invalid start position"); - assert(end <= numIds && "invalid end position"); - values->clear(); - values->reserve(end - start); - for (unsigned i = start; i < end; i++) { - values->push_back(getIdValue(i)); - } - } - inline void getAllIdValues(SmallVectorImpl *values) const { - getIdValues(0, numIds, values); - } - - /// Sets Value associated with the pos^th identifier. - inline void setIdValue(unsigned pos, Value val) { - assert(pos < numIds && "invalid id position"); - ids[pos] = val; - } - /// Sets Values associated with identifiers in the range [start, end). - void setIdValues(unsigned start, unsigned end, ArrayRef values) { - assert((start < numIds || end == start) && "invalid start position"); - assert(end <= numIds && "invalid end position"); - assert(values.size() == end - start); - for (unsigned i = start; i < end; ++i) - ids[i] = values[i - start]; - } - - /// Clears this list of constraints and copies other into it. - void clearAndCopyFrom(const FlatAffineConstraints &other); + /// Replaces the contents of this FlatAffineConstraints with `other`. + virtual void clearAndCopyFrom(const FlatAffineConstraints &other); /// Returns the smallest known constant bound for the extent of the specified /// identifier (pos^th), i.e., the smallest known constant that is greater @@ -575,17 +401,17 @@ class FlatAffineConstraints { /// O(VC) time. void removeRedundantConstraints(); - // Removes all equalities and inequalities. + /// Removes all equalities and inequalities. 
void clearConstraints(); void print(raw_ostream &os) const; void dump() const; -private: +protected: /// Returns false if the fields corresponding to various identifier counts, or /// equality/inequality buffer sizes aren't consistent; true otherwise. This /// is meant to be used within an assert internally. - bool hasConsistentState() const; + virtual bool hasConsistentState() const; /// Checks all rows of equality/inequality constraints for trivial /// contradictions (for example: 1 == 0, 0 >= 1), which may have surfaced @@ -598,10 +424,6 @@ class FlatAffineConstraints { template Optional computeConstantLowerOrUpperBound(unsigned pos); - /// Align `map` with this constraint system based on `operands`. Each operand - /// must already have a corresponding dim/symbol in this constraint system. - AffineMap computeAlignedMap(AffineMap map, ValueRange operands) const; - /// Given an affine map that is aligned with this constraint system: /// * Flatten the map. /// * Add newly introduced local columns at the beginning of this constraint @@ -615,16 +437,16 @@ class FlatAffineConstraints { LogicalResult flattenAlignedMapAndMergeLocals( AffineMap map, std::vector> *flattenedExprs); - // Eliminates a single identifier at 'position' from equality and inequality - // constraints. Returns 'success' if the identifier was eliminated, and - // 'failure' otherwise. + /// Eliminates a single identifier at 'position' from equality and inequality + /// constraints. Returns 'success' if the identifier was eliminated, and + /// 'failure' otherwise. inline LogicalResult gaussianEliminateId(unsigned position) { return success(gaussianEliminateIds(position, position + 1) == 1); } - // Eliminates identifiers from equality and inequality constraints - // in column range [posStart, posLimit). - // Returns the number of variables eliminated. + /// Eliminates identifiers from equality and inequality constraints + /// in column range [posStart, posLimit). 
+ /// Returns the number of variables eliminated. unsigned gaussianEliminateIds(unsigned posStart, unsigned posLimit); /// Eliminates identifier at the specified position using Fourier-Motzkin @@ -634,8 +456,8 @@ class FlatAffineConstraints { /// set to true, a potential under approximation (subset) of the rational /// shadow / exact integer shadow is computed. // See implementation comments for more details. - void fourierMotzkinEliminate(unsigned pos, bool darkShadow = false, - bool *isResultIntegerExact = nullptr); + virtual void fourierMotzkinEliminate(unsigned pos, bool darkShadow = false, + bool *isResultIntegerExact = nullptr); /// Tightens inequalities given that we are dealing with integer spaces. This /// is similar to the GCD test but applied to inequalities. The constant term @@ -651,7 +473,7 @@ class FlatAffineConstraints { /// Removes identifiers in the column range [idStart, idLimit), and copies any /// remaining valid data into place, updates member variables, and resizes /// arrays as needed. - void removeIdRange(unsigned idStart, unsigned idLimit); + virtual void removeIdRange(unsigned idStart, unsigned idLimit); /// Total number of identifiers. unsigned numIds; @@ -669,12 +491,6 @@ class FlatAffineConstraints { /// Coefficients of affine inequalities (in >= 0 form). Matrix inequalities; - /// Values corresponding to the (column) identifiers of this constraint - /// system appearing in the order the identifiers correspond to columns. - /// Temporary ones or those that aren't associated to any Value are set to - /// None. - SmallVector, 8> ids; - /// A parameter that controls detection of an unrealistic number of /// constraints. 
If the number of constraints is this many times the number of /// variables, we consider such a system out of line with the intended use @@ -688,6 +504,318 @@ class FlatAffineConstraints { constexpr static unsigned kExplosionFactor = 32; }; +/// An extension of FlatAffineConstraints in which dimensions and symbols can +/// optionally be associated with an SSA value. +class FlatAffineValueConstraints : public FlatAffineConstraints { +public: + /// Constructs a constraint system reserving memory for the specified number + /// of constraints and identifiers. + FlatAffineValueConstraints(unsigned numReservedInequalities, + unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, + unsigned numSymbols, unsigned numLocals, + ArrayRef> idArgs = {}) + : FlatAffineConstraints(numReservedInequalities, numReservedEqualities, + numReservedCols, numDims, numSymbols, numLocals) { + assert(numReservedCols >= numIds + 1); + assert(idArgs.empty() || idArgs.size() == numIds); + ids.reserve(numReservedCols); + if (idArgs.empty()) + ids.resize(numIds, None); + else + ids.append(idArgs.begin(), idArgs.end()); + } + + /// Constructs a constraint system with the specified number of + /// dimensions and symbols. + FlatAffineValueConstraints(unsigned numDims = 0, unsigned numSymbols = 0, + unsigned numLocals = 0, + ArrayRef> idArgs = {}) + : FlatAffineValueConstraints(/*numReservedInequalities=*/0, + /*numReservedEqualities=*/0, + /*numReservedCols=*/numDims + numSymbols + + numLocals + 1, + numDims, numSymbols, numLocals, idArgs) {} + + /// Create a flat affine constraint system from an AffineValueMap or a list of + /// these. The constructed system will only include equalities. + explicit FlatAffineValueConstraints(const AffineValueMap &avm); + explicit FlatAffineValueConstraints(ArrayRef avmRef); + + /// Creates an affine constraint system from an IntegerSet. 
+ explicit FlatAffineValueConstraints(IntegerSet set); + + FlatAffineValueConstraints(ArrayRef avmRef, + IntegerSet set); + + /// Return the kind of this FlatAffineConstraints. + Kind getKind() const override { return Kind::FlatAffineValueConstraints; } + + static bool classof(const FlatAffineConstraints *cst) { + return cst->getKind() == Kind::FlatAffineValueConstraints; + } + + /// Clears any existing data and reserves memory for the specified + /// constraints. + void reset(unsigned numReservedInequalities, unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, unsigned numSymbols, + unsigned numLocals = 0) override; + void reset(unsigned numReservedInequalities, unsigned numReservedEqualities, + unsigned numReservedCols, unsigned numDims, unsigned numSymbols, + unsigned numLocals, ArrayRef idArgs); + void reset(unsigned numDims, unsigned numSymbols, unsigned numLocals, + ArrayRef idArgs); + using FlatAffineConstraints::reset; + + /// Clones this object. + std::unique_ptr clone() const; + + /// Adds constraints (lower and upper bounds) for the specified 'affine.for' + /// operation's Value using IR information stored in its bound maps. The + /// right identifier is first looked up using forOp's Value. Asserts if the + /// Value corresponding to the 'affine.for' operation isn't found in the + /// constraint system. Returns failure for the yet unimplemented/unsupported + /// cases. Any new identifiers that are found in the bound operands of the + /// 'affine.for' operation are added as trailing identifiers (either + /// dimensional or symbolic depending on whether the operand is a valid + /// symbol). + // TODO: add support for non-unit strides. + LogicalResult addAffineForOpDomain(AffineForOp forOp); + + /// Adds constraints (lower and upper bounds) for each loop in the loop nest + /// described by the bound maps 'lbMaps' and 'ubMaps' of a computation slice. 
+ /// Every pair ('lbMaps[i]', 'ubMaps[i]') describes the bounds of a loop in + /// the nest, sorted outer-to-inner. 'operands' contains the bound operands + /// for a single bound map. All the bound maps will use the same bound + /// operands. Note that some loops described by a computation slice might not + /// exist yet in the IR so the Value attached to those dimension identifiers + /// might be empty. For that reason, this method doesn't perform Value + /// look-ups to retrieve the dimension identifier positions. Instead, it + /// assumes the position of the dim identifiers in the constraint system is + /// the same as the position of the loop in the loop nest. + LogicalResult addDomainFromSliceMaps(ArrayRef lbMaps, + ArrayRef ubMaps, + ArrayRef operands); + + /// Adds constraints imposed by the `affine.if` operation. These constraints + /// are collected from the IntegerSet attached to the given `affine.if` + /// instance argument (`ifOp`). It is asserted that: + /// 1) The IntegerSet of the given `affine.if` instance should not contain + /// semi-affine expressions, + /// 2) The columns of the constraint system created from `ifOp` should match + /// the columns in the current one regarding numbers and values. + void addAffineIfOpDomain(AffineIfOp ifOp); + + /// Adds a lower or an upper bound for the identifier at the specified + /// position with constraints being drawn from the specified bound map and + /// operands. If `eq` is true, add a single equality equal to the bound map's + /// first result expr. + LogicalResult addLowerOrUpperBound(unsigned pos, AffineMap boundMap, + ValueRange operands, bool eq, + bool lower = true); + using FlatAffineConstraints::addLowerOrUpperBound; + + /// Returns the bound for the identifier at `pos` from the inequality at + /// `ineqPos` as a 1-d affine value map (affine map + operands). The returned + /// affine value map can either be a lower bound or an upper bound depending + /// on the sign of atIneq(ineqPos, pos). 
Asserts if the row at `ineqPos` does + /// not involve the `pos`th identifier. + void getIneqAsAffineValueMap(unsigned pos, unsigned ineqPos, + AffineValueMap &vmap, + MLIRContext *context) const; + + /// Adds slice lower bounds represented by lower bounds in 'lbMaps' and upper + /// bounds in 'ubMaps' to each identifier in the constraint system which has + /// a value in 'values'. Note that both lower/upper bounds share the same + /// operand list 'operands'. + /// This function assumes 'values.size' == 'lbMaps.size' == 'ubMaps.size'. + /// Note that both lower/upper bounds use operands from 'operands'. + LogicalResult addSliceBounds(ArrayRef values, + ArrayRef lbMaps, + ArrayRef ubMaps, + ArrayRef operands); + + /// Sets the identifier corresponding to the specified Value id to a + /// constant. Asserts if the 'id' is not found. + void setIdToConstant(Value id, int64_t val); + using FlatAffineConstraints::setIdToConstant; + + /// Looks up the position of the identifier with the specified Value. Returns + /// true if found (false otherwise). `pos' is set to the (column) position of + /// the identifier. + bool findId(Value id, unsigned *pos) const; + + /// Returns true if an identifier with the specified Value exists, false + /// otherwise. + bool containsId(Value id) const; + + /// Swap the posA^th identifier with the posB^th identifier. + void swapId(unsigned posA, unsigned posB) override; + + /// Add identifiers of the specified kind - specified positions are relative + /// to the kind of identifier. The coefficient column corresponding to the + /// added identifier is initialized to zero. 'id' is the Value corresponding + /// to the identifier that can optionally be provided. 
+  void addDimId(unsigned pos, Value id); + using FlatAffineConstraints::addDimId; + void addSymbolId(unsigned pos, Value id); + using FlatAffineConstraints::addSymbolId; + unsigned addId(IdKind kind, unsigned pos) override; + unsigned addId(IdKind kind, unsigned pos, Value id); + + /// Add the specified values as a dim or symbol id depending on its nature, if + /// it doesn't already exist in the system. `id' has to be either a terminal + /// symbol or a loop IV, i.e., it cannot be the result of an affine.apply of any + /// symbols or loop IVs. The identifier is added to the end of the existing + /// dims or symbols. Additional information on the identifier is extracted + /// from the IR and added to the constraint system. + void addInductionVarOrTerminalSymbol(Value id); + + /// Align `map` with this constraint system based on `operands`. Each operand + /// must already have a corresponding dim/symbol in this constraint system. + AffineMap computeAlignedMap(AffineMap map, ValueRange operands) const; + + /// Composes the affine value map with this FlatAffineValueConstraints, adding + /// the results of the map as dimensions at the front + /// [0, vMap->getNumResults()) and with the dimensions set to the equalities + /// specified by the value map. + /// + /// Returns failure if the composition fails (when vMap is a semi-affine map). + /// The vMap's operand Value's are used to look up the right positions in + /// the FlatAffineConstraints with which to associate. Every operand of vMap + /// should have a matching dim/symbol column in this constraint system (with + /// the same associated Value). + LogicalResult composeMap(const AffineValueMap *vMap); + + /// Projects out the identifier that is associated with the given Value. + void projectOut(Value id); + using FlatAffineConstraints::projectOut; + + /// Changes all symbol identifiers which are loop IVs to dim identifiers. 
+ void convertLoopIVSymbolsToDims(); + + /// Updates the constraints to be the smallest bounding (enclosing) box that + /// contains the points of 'this' set and that of 'other', with the symbols + /// being treated specially. For each of the dimensions, the min of the lower + /// bounds (symbolic) and the max of the upper bounds (symbolic) is computed + /// to determine such a bounding box. `other' is expected to have the same + /// dimensional identifiers as this constraint system (in the same order). + /// + /// Eg: if 'this' is {0 <= d0 <= 127}, 'other' is {16 <= d0 <= 192}, the + /// output is {0 <= d0 <= 192}. + /// 2) 'this' = {s0 + 5 <= d0 <= s0 + 20}, 'other' is {s0 + 1 <= d0 <= s0 + + /// 9}, output = {s0 + 1 <= d0 <= s0 + 20}. + /// 3) 'this' = {0 <= d0 <= 5, 1 <= d1 <= 9}, 'other' = {2 <= d0 <= 6, 5 <= d1 + /// <= 15}, output = {0 <= d0 <= 6, 1 <= d1 <= 15}. + LogicalResult unionBoundingBox(const FlatAffineValueConstraints &other); + using FlatAffineConstraints::unionBoundingBox; + + /// Merge and align the identifiers of 'this' and 'other' starting at + /// 'offset', so that both constraint systems get the union of the contained + /// identifiers that is dimension-wise and symbol-wise unique; both + /// constraint systems are updated so that they have the union of all + /// identifiers, with this's original identifiers appearing first followed by + /// any of other's identifiers that didn't appear in 'this'. Local + /// identifiers of each system are by design separate/local and are placed + /// one after other (this's followed by other's). 
+ // Eg: Input: 'this' has ((%i %j) [%M %N]) + // 'other' has (%k, %j) [%P, %N, %M]) + // Output: both 'this', 'other' have (%i, %j, %k) [%M, %N, %P] + // + void mergeAndAlignIdsWithOther(unsigned offset, + FlatAffineValueConstraints *other); + + /// Returns 'true' if this constraint system and 'other' are in the same + /// space, i.e., if they are associated with the same set of identifiers, + /// appearing in the same order. Returns 'false' otherwise. + bool areIdsAlignedWithOther(const FlatAffineValueConstraints &other); + + /// Replaces the contents of this FlatAffineValueConstraints with `other`. + void clearAndCopyFrom(const FlatAffineConstraints &other) override; + + inline ArrayRef> getIds() const { + return {ids.data(), ids.size()}; + } + inline MutableArrayRef> getIds() { + return {ids.data(), ids.size()}; + } + + /// Returns the optional Value corresponding to the pos^th identifier. + inline Optional getId(unsigned pos) const { return ids[pos]; } + inline Optional &getId(unsigned pos) { return ids[pos]; } + + /// Returns the Value associated with the pos^th identifier. Asserts if + /// no Value identifier was associated. + inline Value getIdValue(unsigned pos) const { + assert(hasIdValue(pos) && "identifier's Value not set"); + return ids[pos].getValue(); + } + + /// Returns true if the pos^th identifier has an associated Value. + inline bool hasIdValue(unsigned pos) const { return ids[pos].hasValue(); } + + /// Returns true if at least one identifier has an associated Value. + bool hasIdValues() const; + + /// Returns the Values associated with identifiers in range [start, end). + /// Asserts if no Value was associated with one of these identifiers. 
+ void getIdValues(unsigned start, unsigned end, + SmallVectorImpl *values) const { + assert((start < numIds || start == end) && "invalid start position"); + assert(end <= numIds && "invalid end position"); + values->clear(); + values->reserve(end - start); + for (unsigned i = start; i < end; i++) { + values->push_back(getIdValue(i)); + } + } + inline void getAllIdValues(SmallVectorImpl *values) const { + getIdValues(0, numIds, values); + } + + /// Sets Value associated with the pos^th identifier. + inline void setIdValue(unsigned pos, Value val) { + assert(pos < numIds && "invalid id position"); + ids[pos] = val; + } + + /// Sets Values associated with identifiers in the range [start, end). + void setIdValues(unsigned start, unsigned end, ArrayRef values) { + assert((start < numIds || end == start) && "invalid start position"); + assert(end <= numIds && "invalid end position"); + assert(values.size() == end - start); + for (unsigned i = start; i < end; ++i) + ids[i] = values[i - start]; + } + +protected: + /// Returns false if the fields corresponding to various identifier counts, or + /// equality/inequality buffer sizes aren't consistent; true otherwise. This + /// is meant to be used within an assert internally. + bool hasConsistentState() const override; + + /// Removes identifiers in the column range [idStart, idLimit), and copies any + /// remaining valid data into place, updates member variables, and resizes + /// arrays as needed. + void removeIdRange(unsigned idStart, unsigned idLimit) override; + + /// Eliminates identifier at the specified position using Fourier-Motzkin + /// variable elimination, but uses Gaussian elimination if there is an + /// equality involving that identifier. If the result of the elimination is + /// integer exact, *isResultIntegerExact is set to true. If 'darkShadow' is + /// set to true, a potential under approximation (subset) of the rational + /// shadow / exact integer shadow is computed. 
+ // See implementation comments for more details. + void fourierMotzkinEliminate(unsigned pos, bool darkShadow = false, + bool *isResultIntegerExact = nullptr) override; + + /// Values corresponding to the (column) identifiers of this constraint + /// system appearing in the order the identifiers correspond to columns. + /// Temporary ones or those that aren't associated to any Value are set to + /// None. + SmallVector, 8> ids; +}; + /// Flattens 'expr' into 'flattenedExpr', which contains the coefficients of the /// dimensions, symbols, and additional variables that represent floor divisions /// of dimensions, symbols, and in turn other floor divisions. Returns failure diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h index 89b73dace9b7d..d1d2218b8c019 100644 --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -28,7 +28,6 @@ namespace mlir { class AffineForOp; class Block; -class FlatAffineConstraints; class Location; struct MemRefAccess; class Operation; @@ -93,13 +92,13 @@ struct ComputationSliceState { // Constraints are added for all loop IV bounds (dim or symbol), and // constraints are added for slice bounds in 'lbs'/'ubs'. // Returns failure if we cannot add loop bounds because of unsupported cases. - LogicalResult getAsConstraints(FlatAffineConstraints *cst); + LogicalResult getAsConstraints(FlatAffineValueConstraints *cst); /// Adds to 'cst' constraints which represent the original loop bounds on /// 'ivs' in 'this'. This corresponds to the original domain of the loop nest /// from which the slice is being computed. Returns failure if we cannot add /// loop bounds because of unsupported cases. - LogicalResult getSourceAsConstraints(FlatAffineConstraints &cst); + LogicalResult getSourceAsConstraints(FlatAffineValueConstraints &cst); // Clears all bounds and operands in slice state. 
void clearBounds(); @@ -183,7 +182,7 @@ struct ComputationSliceState { // } // void getComputationSliceState(Operation *depSourceOp, Operation *depSinkOp, - FlatAffineConstraints *dependenceConstraints, + FlatAffineValueConstraints *dependenceConstraints, unsigned loopDepth, bool isBackwardSlice, ComputationSliceState *sliceState); @@ -243,7 +242,7 @@ AffineForOp insertBackwardComputationSlice(Operation *srcOpInst, // } // // Region: {memref = %A, write = false, {%i <= m0 <= %i + 7} } -// The last field is a 2-d FlatAffineConstraints symbolic in %i. +// The last field is a 2-d FlatAffineValueConstraints symbolic in %i. // struct MemRefRegion { explicit MemRefRegion(Location loc) : loc(loc) {} @@ -278,14 +277,14 @@ struct MemRefRegion { /// } /// /// {memref = %A, write = false, {%i <= m0 <= %i + 7} } - /// The last field is a 2-d FlatAffineConstraints symbolic in %i. + /// The last field is a 2-d FlatAffineValueConstraints symbolic in %i. /// LogicalResult compute(Operation *op, unsigned loopDepth, const ComputationSliceState *sliceState = nullptr, bool addMemRefDimBounds = true); - FlatAffineConstraints *getConstraints() { return &cst; } - const FlatAffineConstraints *getConstraints() const { return &cst; } + FlatAffineValueConstraints *getConstraints() { return &cst; } + const FlatAffineValueConstraints *getConstraints() const { return &cst; } bool isWrite() const { return write; } void setWrite(bool flag) { write = flag; } @@ -309,10 +308,10 @@ struct MemRefRegion { void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, AffineMap &ubMap) const; - /// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos' - /// corresponds to the position of the memref shape's dimension (major to - /// minor) which matches 1:1 with the dimensional identifier positions in - //'cst'. + /// A wrapper around FlatAffineValueConstraints::getConstantBoundOnDimSize(). 
+ /// 'pos' corresponds to the position of the memref shape's dimension (major + /// to minor) which matches 1:1 with the dimensional identifier positions in + /// 'cst'. Optional getConstantBoundOnDimSize(unsigned pos, SmallVectorImpl *lb = nullptr, @@ -324,7 +323,7 @@ struct MemRefRegion { /// Returns the size of this MemRefRegion in bytes. Optional getRegionSize(); - // Wrapper around FlatAffineConstraints::unionBoundingBox. + // Wrapper around FlatAffineValueConstraints::unionBoundingBox. LogicalResult unionBoundingBox(const MemRefRegion &other); /// Returns the rank of the memref that this region corresponds to. @@ -348,7 +347,7 @@ struct MemRefRegion { /// and thus the region is symbolic in the outer surrounding loops at that /// depth. // TODO: Replace this to exploit HyperRectangularSet. - FlatAffineConstraints cst; + FlatAffineValueConstraints cst; }; /// Returns the size of memref data in bytes if it's statically shaped, None diff --git a/mlir/include/mlir/Dialect/Linalg/Analysis/ConstraintsSet.h b/mlir/include/mlir/Dialect/Linalg/Analysis/ConstraintsSet.h index 32ffd97235eac..4cb381b0f2177 100644 --- a/mlir/include/mlir/Dialect/Linalg/Analysis/ConstraintsSet.h +++ b/mlir/include/mlir/Dialect/Linalg/Analysis/ConstraintsSet.h @@ -1,4 +1,4 @@ -//===- ConstraintsSet.h - Extensions for FlatAffineConstraints --*- C++ -*-===// +//===- ConstraintsSet.h - Ext. for FlatAffineValueConstraints ---*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -20,9 +20,9 @@ namespace mlir { class ValueRange; /// Linalg-specific constraints set extensions. 
-class ConstraintsSet : public FlatAffineConstraints { +class ConstraintsSet : public FlatAffineValueConstraints { public: - ConstraintsSet() : FlatAffineConstraints() {} + ConstraintsSet() : FlatAffineValueConstraints() {} /// Assuming `val` is defined by `val = affine.min map (operands)`, introduce /// all the constraints `val <= expr_i(operands)`, where expr_i are all the diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp index 18e6430c63e52..293bcf8c0dd94 100644 --- a/mlir/lib/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Analysis/AffineAnalysis.cpp @@ -157,7 +157,7 @@ bool mlir::isLoopMemoryParallel(AffineForOp forOp) { MemRefAccess srcAccess(srcOp); for (auto *dstOp : loadAndStoreOps) { MemRefAccess dstAccess(dstOp); - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; DependenceResult result = checkMemrefAccessDependence( srcAccess, dstAccess, depth, &dependenceConstraints, /*dependenceComponents=*/nullptr); @@ -220,11 +220,11 @@ void mlir::getReachableAffineApplyOps( // the bound operands are added as symbols in the system. Returns failure for // the yet unimplemented cases. // TODO: Handle non-unit steps through local variables or stride information in -// FlatAffineConstraints. (For eg., by using iv - lb % step = 0 and/or by -// introducing a method in FlatAffineConstraints setExprStride(ArrayRef -// expr, int64_t stride) +// FlatAffineValueConstraints. (For eg., by using iv - lb % step = 0 and/or by +// introducing a method in FlatAffineValueConstraints +// setExprStride(ArrayRef expr, int64_t stride) LogicalResult mlir::getIndexSet(MutableArrayRef ops, - FlatAffineConstraints *domain) { + FlatAffineValueConstraints *domain) { SmallVector indices; SmallVector forOps; @@ -255,7 +255,7 @@ LogicalResult mlir::getIndexSet(MutableArrayRef ops, /// 'indexSet' correspond to the loops surrounding 'op' from outermost to /// innermost. 
static LogicalResult getOpIndexSet(Operation *op, - FlatAffineConstraints *indexSet) { + FlatAffineValueConstraints *indexSet) { SmallVector ops; getEnclosingAffineForAndIfOps(*op, &ops); return getIndexSet(ops, indexSet); @@ -352,10 +352,11 @@ class ValuePositionMap { // 'srcAccessMap'/'dstAccessMap' (as well as those in 'srcDomain'/'dstDomain') // to the position of these values in the merged list. static void buildDimAndSymbolPositionMaps( - const FlatAffineConstraints &srcDomain, - const FlatAffineConstraints &dstDomain, const AffineValueMap &srcAccessMap, - const AffineValueMap &dstAccessMap, ValuePositionMap *valuePosMap, - FlatAffineConstraints *dependenceConstraints) { + const FlatAffineValueConstraints &srcDomain, + const FlatAffineValueConstraints &dstDomain, + const AffineValueMap &srcAccessMap, const AffineValueMap &dstAccessMap, + ValuePositionMap *valuePosMap, + FlatAffineValueConstraints *dependenceConstraints) { // IsDimState is a tri-state boolean. It is used to distinguish three // different cases of the values passed to updateValuePosMap. @@ -437,13 +438,15 @@ static void buildDimAndSymbolPositionMaps( // Sets up dependence constraints columns appropriately, in the format: // [src-dim-ids, dst-dim-ids, symbol-ids, local-ids, const_term] -static void initDependenceConstraints( - const FlatAffineConstraints &srcDomain, - const FlatAffineConstraints &dstDomain, const AffineValueMap &srcAccessMap, - const AffineValueMap &dstAccessMap, const ValuePositionMap &valuePosMap, - FlatAffineConstraints *dependenceConstraints) { +static void +initDependenceConstraints(const FlatAffineValueConstraints &srcDomain, + const FlatAffineValueConstraints &dstDomain, + const AffineValueMap &srcAccessMap, + const AffineValueMap &dstAccessMap, + const ValuePositionMap &valuePosMap, + FlatAffineValueConstraints *dependenceConstraints) { // Calculate number of equalities/inequalities and columns required to - // initialize FlatAffineConstraints for 'dependenceDomain'. 
+ // initialize FlatAffineValueConstraints for 'dependenceDomain'. unsigned numIneq = srcDomain.getNumInequalities() + dstDomain.getNumInequalities(); AffineMap srcMap = srcAccessMap.getAffineMap(); @@ -507,16 +510,16 @@ static void initDependenceConstraints( // 'dependenceDomain'. // Uses 'valuePosMap' to determine the position in 'dependenceDomain' to which a // srcDomain/dstDomain Value maps. -static void addDomainConstraints(const FlatAffineConstraints &srcDomain, - const FlatAffineConstraints &dstDomain, +static void addDomainConstraints(const FlatAffineValueConstraints &srcDomain, + const FlatAffineValueConstraints &dstDomain, const ValuePositionMap &valuePosMap, - FlatAffineConstraints *dependenceDomain) { + FlatAffineValueConstraints *dependenceDomain) { unsigned depNumDimsAndSymbolIds = dependenceDomain->getNumDimAndSymbolIds(); SmallVector cst(dependenceDomain->getNumCols()); auto addDomain = [&](bool isSrc, bool isEq, unsigned localOffset) { - const FlatAffineConstraints &domain = isSrc ? srcDomain : dstDomain; + const FlatAffineValueConstraints &domain = isSrc ? srcDomain : dstDomain; unsigned numCsts = isEq ? 
domain.getNumEqualities() : domain.getNumInequalities(); unsigned numDimAndSymbolIds = domain.getNumDimAndSymbolIds(); @@ -587,7 +590,7 @@ static LogicalResult addMemRefAccessConstraints(const AffineValueMap &srcAccessMap, const AffineValueMap &dstAccessMap, const ValuePositionMap &valuePosMap, - FlatAffineConstraints *dependenceDomain) { + FlatAffineValueConstraints *dependenceDomain) { AffineMap srcMap = srcAccessMap.getAffineMap(); AffineMap dstMap = dstAccessMap.getAffineMap(); assert(srcMap.getNumResults() == dstMap.getNumResults()); @@ -601,7 +604,7 @@ addMemRefAccessConstraints(const AffineValueMap &srcAccessMap, std::vector> srcFlatExprs; std::vector> destFlatExprs; - FlatAffineConstraints srcLocalVarCst, destLocalVarCst; + FlatAffineValueConstraints srcLocalVarCst, destLocalVarCst; // Get flattened expressions for the source destination maps. if (failed(getFlattenedAffineExprs(srcMap, &srcFlatExprs, &srcLocalVarCst)) || failed(getFlattenedAffineExprs(dstMap, &destFlatExprs, &destLocalVarCst))) @@ -716,8 +719,8 @@ addMemRefAccessConstraints(const AffineValueMap &srcAccessMap, // Returns the number of outer loop common to 'src/dstDomain'. // Loops common to 'src/dst' domains are added to 'commonLoops' if non-null. static unsigned -getNumCommonLoops(const FlatAffineConstraints &srcDomain, - const FlatAffineConstraints &dstDomain, +getNumCommonLoops(const FlatAffineValueConstraints &srcDomain, + const FlatAffineValueConstraints &dstDomain, SmallVectorImpl *commonLoops = nullptr) { // Find the number of common loops shared by src and dst accesses. unsigned minNumLoops = @@ -740,7 +743,7 @@ getNumCommonLoops(const FlatAffineConstraints &srcDomain, /// Returns Block common to 'srcAccess.opInst' and 'dstAccess.opInst'. 
static Block *getCommonBlock(const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, - const FlatAffineConstraints &srcDomain, + const FlatAffineValueConstraints &srcDomain, unsigned numCommonLoops) { // Get the chain of ancestor blocks to the given `MemRefAccess` instance. The // search terminates when either an op with the `AffineScope` trait or @@ -791,7 +794,7 @@ static Block *getCommonBlock(const MemRefAccess &srcAccess, // 'numCommonLoops' is the number of contiguous surrounding outer loops. static bool srcAppearsBeforeDstInAncestralBlock( const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, - const FlatAffineConstraints &srcDomain, unsigned numCommonLoops) { + const FlatAffineValueConstraints &srcDomain, unsigned numCommonLoops) { // Get Block common to 'srcAccess.opInst' and 'dstAccess.opInst'. auto *commonBlock = getCommonBlock(srcAccess, dstAccess, srcDomain, numCommonLoops); @@ -813,10 +816,11 @@ static bool srcAppearsBeforeDstInAncestralBlock( // *) If 'loopDepth == 1' then one constraint is added: i' >= i + 1 // *) If 'loopDepth == 2' then two constraints are added: i == i' and j' > j + 1 // *) If 'loopDepth == 3' then two constraints are added: i == i' and j == j' -static void addOrderingConstraints(const FlatAffineConstraints &srcDomain, - const FlatAffineConstraints &dstDomain, - unsigned loopDepth, - FlatAffineConstraints *dependenceDomain) { +static void +addOrderingConstraints(const FlatAffineValueConstraints &srcDomain, + const FlatAffineValueConstraints &dstDomain, + unsigned loopDepth, + FlatAffineValueConstraints *dependenceDomain) { unsigned numCols = dependenceDomain->getNumCols(); SmallVector eq(numCols); unsigned numSrcDims = srcDomain.getNumDimIds(); @@ -840,9 +844,9 @@ static void addOrderingConstraints(const FlatAffineConstraints &srcDomain, // eliminating all other variables, and reading off distance vectors from // equality constraints (if possible), and direction vectors from inequalities. 
static void computeDirectionVector( - const FlatAffineConstraints &srcDomain, - const FlatAffineConstraints &dstDomain, unsigned loopDepth, - FlatAffineConstraints *dependenceDomain, + const FlatAffineValueConstraints &srcDomain, + const FlatAffineValueConstraints &dstDomain, unsigned loopDepth, + FlatAffineValueConstraints *dependenceDomain, SmallVector *dependenceComponents) { // Find the number of common loops shared by src and dst accesses. SmallVector commonLoops; @@ -996,7 +1000,7 @@ void MemRefAccess::getAccessMap(AffineValueMap *accessMap) const { // TODO: Support AffineExprs mod/floordiv/ceildiv. DependenceResult mlir::checkMemrefAccessDependence( const MemRefAccess &srcAccess, const MemRefAccess &dstAccess, - unsigned loopDepth, FlatAffineConstraints *dependenceConstraints, + unsigned loopDepth, FlatAffineValueConstraints *dependenceConstraints, SmallVector *dependenceComponents, bool allowRAR) { LLVM_DEBUG(llvm::dbgs() << "Checking for dependence at depth: " << Twine(loopDepth) << " between:\n";); @@ -1022,12 +1026,12 @@ DependenceResult mlir::checkMemrefAccessDependence( dstAccess.getAccessMap(&dstAccessMap); // Get iteration domain for the 'srcAccess' operation. - FlatAffineConstraints srcDomain; + FlatAffineValueConstraints srcDomain; if (failed(getOpIndexSet(srcAccess.opInst, &srcDomain))) return DependenceResult::Failure; // Get iteration domain for 'dstAccess' operation. - FlatAffineConstraints dstDomain; + FlatAffineValueConstraints dstDomain; if (failed(getOpIndexSet(dstAccess.opInst, &dstDomain))) return DependenceResult::Failure; @@ -1106,7 +1110,7 @@ void mlir::getDependenceComponents( auto *dstOp = loadAndStoreOps[j]; MemRefAccess dstAccess(dstOp); - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; SmallVector depComps; // TODO: Explore whether it would be profitable to pre-compute and store // deps instead of repeatedly checking. 
diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index 984500e94dbd8..d36893bca61c6 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -141,7 +141,7 @@ LogicalResult mlir::getFlattenedAffineExprs( } //===----------------------------------------------------------------------===// -// FlatAffineConstraints. +// FlatAffineConstraints / FlatAffineValueConstraints. //===----------------------------------------------------------------------===// // Clones this object. @@ -149,14 +149,17 @@ std::unique_ptr FlatAffineConstraints::clone() const { return std::make_unique(*this); } +std::unique_ptr +FlatAffineValueConstraints::clone() const { + return std::make_unique(*this); +} + // Construct from an IntegerSet. FlatAffineConstraints::FlatAffineConstraints(IntegerSet set) : numIds(set.getNumDims() + set.getNumSymbols()), numDims(set.getNumDims()), numSymbols(set.getNumSymbols()), equalities(0, numIds + 1, set.getNumEqualities(), numIds + 1), inequalities(0, numIds + 1, set.getNumInequalities(), numIds + 1) { - ids.resize(numIds, None); - // Flatten expressions and add them to the constraint system. std::vector> flatExprs; FlatAffineConstraints localVarCst; @@ -182,26 +185,59 @@ FlatAffineConstraints::FlatAffineConstraints(IntegerSet set) append(localVarCst); } +// Construct from an IntegerSet. 
+FlatAffineValueConstraints::FlatAffineValueConstraints(IntegerSet set) + : FlatAffineConstraints(set) { + ids.resize(numIds, None); +} + void FlatAffineConstraints::reset(unsigned numReservedInequalities, unsigned numReservedEqualities, unsigned newNumReservedCols, unsigned newNumDims, unsigned newNumSymbols, - unsigned newNumLocals, - ArrayRef idArgs) { + unsigned newNumLocals) { + assert(newNumReservedCols >= newNumDims + newNumSymbols + newNumLocals + 1 && + "minimum 1 column"); + *this = FlatAffineConstraints(numReservedInequalities, numReservedEqualities, + newNumReservedCols, newNumDims, newNumSymbols, + newNumLocals); +} + +void FlatAffineValueConstraints::reset(unsigned numReservedInequalities, + unsigned numReservedEqualities, + unsigned newNumReservedCols, + unsigned newNumDims, + unsigned newNumSymbols, + unsigned newNumLocals) { + reset(numReservedInequalities, numReservedEqualities, newNumReservedCols, + newNumDims, newNumSymbols, newNumLocals, /*idArgs=*/{}); +} + +void FlatAffineValueConstraints::reset( + unsigned numReservedInequalities, unsigned numReservedEqualities, + unsigned newNumReservedCols, unsigned newNumDims, unsigned newNumSymbols, + unsigned newNumLocals, ArrayRef idArgs) { assert(newNumReservedCols >= newNumDims + newNumSymbols + newNumLocals + 1 && "minimum 1 column"); SmallVector, 8> newIds; if (!idArgs.empty()) newIds.assign(idArgs.begin(), idArgs.end()); - *this = FlatAffineConstraints(numReservedInequalities, numReservedEqualities, - newNumReservedCols, newNumDims, newNumSymbols, - newNumLocals, newIds); + *this = FlatAffineValueConstraints( + numReservedInequalities, numReservedEqualities, newNumReservedCols, + newNumDims, newNumSymbols, newNumLocals, newIds); } void FlatAffineConstraints::reset(unsigned newNumDims, unsigned newNumSymbols, - unsigned newNumLocals, - ArrayRef idArgs) { + unsigned newNumLocals) { + reset(0, 0, newNumDims + newNumSymbols + newNumLocals + 1, newNumDims, + newNumSymbols, newNumLocals); +} + +void 
FlatAffineValueConstraints::reset(unsigned newNumDims, + unsigned newNumSymbols, + unsigned newNumLocals, + ArrayRef idArgs) { reset(0, 0, newNumDims + newNumSymbols + newNumLocals + 1, newNumDims, newNumSymbols, newNumLocals, idArgs); } @@ -227,17 +263,23 @@ void FlatAffineConstraints::addLocalId(unsigned pos) { addId(IdKind::Local, pos); } -void FlatAffineConstraints::addDimId(unsigned pos, Value id) { +void FlatAffineConstraints::addDimId(unsigned pos) { + addId(IdKind::Dimension, pos); +} + +void FlatAffineValueConstraints::addDimId(unsigned pos, Value id) { addId(IdKind::Dimension, pos, id); } -void FlatAffineConstraints::addSymbolId(unsigned pos, Value id) { +void FlatAffineConstraints::addSymbolId(unsigned pos) { + addId(IdKind::Symbol, pos); +} + +void FlatAffineValueConstraints::addSymbolId(unsigned pos, Value id) { addId(IdKind::Symbol, pos, id); } -/// Adds a dimensional identifier. The added column is initialized to -/// zero. -void FlatAffineConstraints::addId(IdKind kind, unsigned pos, Value id) { +unsigned FlatAffineConstraints::addId(IdKind kind, unsigned pos) { if (kind == IdKind::Dimension) assert(pos <= getNumDimIds()); else if (kind == IdKind::Symbol) @@ -245,7 +287,7 @@ void FlatAffineConstraints::addId(IdKind kind, unsigned pos, Value id) { else assert(pos <= getNumLocalIds()); - int absolutePos; + unsigned absolutePos; if (kind == IdKind::Dimension) { absolutePos = pos; numDims++; @@ -260,18 +302,36 @@ void FlatAffineConstraints::addId(IdKind kind, unsigned pos, Value id) { inequalities.insertColumn(absolutePos); equalities.insertColumn(absolutePos); + return absolutePos; +} + +unsigned FlatAffineValueConstraints::addId(IdKind kind, unsigned pos) { + return addId(kind, pos, /*id=*/{}); +} + +unsigned FlatAffineValueConstraints::addId(IdKind kind, unsigned pos, + Value id) { + unsigned absolutePos = FlatAffineConstraints::addId(kind, pos); + // If an 'id' is provided, insert it; otherwise use None. 
if (id) ids.insert(ids.begin() + absolutePos, id); else ids.insert(ids.begin() + absolutePos, None); assert(ids.size() == getNumIds()); + + return absolutePos; +} + +bool FlatAffineValueConstraints::hasIdValues() const { + return llvm::find_if(ids, [](Optional id) { return id.hasValue(); }) != + ids.end(); } /// Checks if two constraint systems are in the same space, i.e., if they are /// associated with the same set of identifiers, appearing in the same order. -static bool areIdsAligned(const FlatAffineConstraints &a, - const FlatAffineConstraints &b) { +static bool areIdsAligned(const FlatAffineValueConstraints &a, + const FlatAffineValueConstraints &b) { return a.getNumDimIds() == b.getNumDimIds() && a.getNumSymbolIds() == b.getNumSymbolIds() && a.getNumIds() == b.getNumIds() && a.getIds().equals(b.getIds()); @@ -279,14 +339,14 @@ static bool areIdsAligned(const FlatAffineConstraints &a, /// Calls areIdsAligned to check if two constraint systems have the same set /// of identifiers in the same order. -bool FlatAffineConstraints::areIdsAlignedWithOther( - const FlatAffineConstraints &other) { +bool FlatAffineValueConstraints::areIdsAlignedWithOther( + const FlatAffineValueConstraints &other) { return areIdsAligned(*this, other); } /// Checks if the SSA values associated with `cst''s identifiers are unique. static bool LLVM_ATTRIBUTE_UNUSED -areIdsUnique(const FlatAffineConstraints &cst) { +areIdsUnique(const FlatAffineValueConstraints &cst) { SmallPtrSet uniqueIds; for (auto id : cst.getIds()) { if (id.hasValue() && !uniqueIds.insert(id.getValue()).second) @@ -304,9 +364,8 @@ areIdsUnique(const FlatAffineConstraints &cst) { /// and are placed one after other (A's followed by B's). 
// Eg: Input: A has ((%i %j) [%M %N]) and B has (%k, %j) [%P, %N, %M]) // Output: both A, B have (%i, %j, %k) [%M, %N, %P] -// -static void mergeAndAlignIds(unsigned offset, FlatAffineConstraints *a, - FlatAffineConstraints *b) { +static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, + FlatAffineValueConstraints *b) { assert(offset <= a->getNumDimIds() && offset <= b->getNumDimIds()); // A merge/align isn't meaningful if a cst's ids aren't distinct. assert(areIdsUnique(*a) && "A's id values aren't unique"); @@ -382,12 +441,13 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineConstraints *a, } // Call 'mergeAndAlignIds' to align constraint systems of 'this' and 'other'. -void FlatAffineConstraints::mergeAndAlignIdsWithOther( - unsigned offset, FlatAffineConstraints *other) { +void FlatAffineValueConstraints::mergeAndAlignIdsWithOther( + unsigned offset, FlatAffineValueConstraints *other) { mergeAndAlignIds(offset, this, other); } -LogicalResult FlatAffineConstraints::composeMap(const AffineValueMap *vMap) { +LogicalResult +FlatAffineValueConstraints::composeMap(const AffineValueMap *vMap) { return composeMatchingMap( computeAlignedMap(vMap->getAffineMap(), vMap->getOperands())); } @@ -446,7 +506,7 @@ LogicalResult FlatAffineConstraints::composeMatchingMap(AffineMap other) { } // Turn a symbol into a dimension. -static void turnSymbolIntoDim(FlatAffineConstraints *cst, Value id) { +static void turnSymbolIntoDim(FlatAffineValueConstraints *cst, Value id) { unsigned pos; if (cst->findId(id, &pos) && pos >= cst->getNumDimIds() && pos < cst->getNumDimAndSymbolIds()) { @@ -456,7 +516,7 @@ static void turnSymbolIntoDim(FlatAffineConstraints *cst, Value id) { } // Changes all symbol identifiers which are loop IVs to dim identifiers. -void FlatAffineConstraints::convertLoopIVSymbolsToDims() { +void FlatAffineValueConstraints::convertLoopIVSymbolsToDims() { // Gather all symbols which are loop IVs. 
SmallVector loopIVs; for (unsigned i = getNumDimIds(), e = getNumDimAndSymbolIds(); i < e; i++) { @@ -469,7 +529,7 @@ void FlatAffineConstraints::convertLoopIVSymbolsToDims() { } } -void FlatAffineConstraints::addInductionVarOrTerminalSymbol(Value id) { +void FlatAffineValueConstraints::addInductionVarOrTerminalSymbol(Value id) { if (containsId(id)) return; @@ -491,7 +551,8 @@ void FlatAffineConstraints::addInductionVarOrTerminalSymbol(Value id) { setIdToConstant(id, constOp.getValue()); } -LogicalResult FlatAffineConstraints::addAffineForOpDomain(AffineForOp forOp) { +LogicalResult +FlatAffineValueConstraints::addAffineForOpDomain(AffineForOp forOp) { unsigned pos; // Pre-condition for this method. if (!findId(forOp.getInductionVar(), &pos)) { @@ -556,9 +617,9 @@ LogicalResult FlatAffineConstraints::addAffineForOpDomain(AffineForOp forOp) { /// assumes the position of the dim identifiers in the constraint system is /// the same as the position of the loop in the loop nest. LogicalResult -FlatAffineConstraints::addDomainFromSliceMaps(ArrayRef lbMaps, - ArrayRef ubMaps, - ArrayRef operands) { +FlatAffineValueConstraints::addDomainFromSliceMaps(ArrayRef lbMaps, + ArrayRef ubMaps, + ArrayRef operands) { assert(lbMaps.size() == ubMaps.size()); assert(lbMaps.size() <= getNumDimIds()); @@ -608,9 +669,9 @@ FlatAffineConstraints::addDomainFromSliceMaps(ArrayRef lbMaps, return success(); } -void FlatAffineConstraints::addAffineIfOpDomain(AffineIfOp ifOp) { +void FlatAffineValueConstraints::addAffineIfOpDomain(AffineIfOp ifOp) { // Create the base constraints from the integer set attached to ifOp. - FlatAffineConstraints cst(ifOp.getIntegerSet()); + FlatAffineValueConstraints cst(ifOp.getIntegerSet()); // Bind ids in the constraints to ifOp operands. 
SmallVector operands = ifOp.getOperands(); @@ -679,8 +740,6 @@ bool FlatAffineConstraints::hasConsistentState() const { return false; if (!equalities.hasConsistentState()) return false; - if (ids.size() != getNumIds()) - return false; // Catches errors where numDims, numSymbols, numIds aren't consistent. if (numDims > numIds || numSymbols > numIds || numDims + numSymbols > numIds) @@ -689,6 +748,11 @@ bool FlatAffineConstraints::hasConsistentState() const { return true; } +bool FlatAffineValueConstraints::hasConsistentState() const { + return FlatAffineConstraints::hasConsistentState() && + ids.size() == getNumIds(); +} + /// Checks all rows of equality/inequality constraints for trivial /// contradictions (for example: 1 == 0, 0 >= 1), which may have surfaced /// after elimination. Returns 'true' if an invalid constraint is found; @@ -793,7 +857,11 @@ void FlatAffineConstraints::removeIdRange(unsigned idStart, unsigned idLimit) { numDims -= numDimsEliminated; numSymbols -= numSymbolsEliminated; numIds = numIds - numColsEliminated; +} +void FlatAffineValueConstraints::removeIdRange(unsigned idStart, + unsigned idLimit) { + FlatAffineConstraints::removeIdRange(idStart, idLimit); ids.erase(ids.begin() + idStart, ids.begin() + idLimit); } @@ -1927,8 +1995,9 @@ LogicalResult FlatAffineConstraints::addLowerOrUpperBound(unsigned pos, return success(); } -AffineMap FlatAffineConstraints::computeAlignedMap(AffineMap map, - ValueRange operands) const { +AffineMap +FlatAffineValueConstraints::computeAlignedMap(AffineMap map, + ValueRange operands) const { assert(map.getNumInputs() == operands.size() && "number of inputs mismatch"); SmallVector dims, syms; @@ -1955,10 +2024,9 @@ AffineMap FlatAffineConstraints::computeAlignedMap(AffineMap map, return alignedMap; } -LogicalResult -FlatAffineConstraints::addLowerOrUpperBound(unsigned pos, AffineMap boundMap, - ValueRange boundOperands, bool eq, - bool lower) { +LogicalResult FlatAffineValueConstraints::addLowerOrUpperBound( + 
unsigned pos, AffineMap boundMap, ValueRange boundOperands, bool eq, + bool lower) { // Fully compose map and operands; canonicalize and simplify so that we // transitively get to terminal symbols or loop IVs. auto map = boundMap; @@ -1980,10 +2048,9 @@ FlatAffineConstraints::addLowerOrUpperBound(unsigned pos, AffineMap boundMap, // Note that both lower/upper bounds use operands from 'operands'. // Returns failure for unimplemented cases such as semi-affine expressions or // expressions with mod/floordiv. -LogicalResult FlatAffineConstraints::addSliceBounds(ArrayRef values, - ArrayRef lbMaps, - ArrayRef ubMaps, - ArrayRef operands) { +LogicalResult FlatAffineValueConstraints::addSliceBounds( + ArrayRef values, ArrayRef lbMaps, + ArrayRef ubMaps, ArrayRef operands) { assert(values.size() == lbMaps.size()); assert(lbMaps.size() == ubMaps.size()); @@ -2099,7 +2166,7 @@ void FlatAffineConstraints::addLocalFloorDiv(ArrayRef dividend, addInequality(bound); } -bool FlatAffineConstraints::findId(Value id, unsigned *pos) const { +bool FlatAffineValueConstraints::findId(Value id, unsigned *pos) const { unsigned i = 0; for (const auto &mayBeId : ids) { if (mayBeId.hasValue() && mayBeId.getValue() == id) { @@ -2111,7 +2178,7 @@ bool FlatAffineConstraints::findId(Value id, unsigned *pos) const { return false; } -bool FlatAffineConstraints::containsId(Value id) const { +bool FlatAffineValueConstraints::containsId(Value id) const { return llvm::any_of(ids, [&](const Optional &mayBeId) { return mayBeId.hasValue() && mayBeId.getValue() == id; }); @@ -2128,6 +2195,10 @@ void FlatAffineConstraints::swapId(unsigned posA, unsigned posB) { std::swap(atIneq(r, posA), atIneq(r, posB)); for (unsigned r = 0, e = getNumEqualities(); r < e; r++) std::swap(atEq(r, posA), atEq(r, posB)); +} + +void FlatAffineValueConstraints::swapId(unsigned posA, unsigned posB) { + FlatAffineConstraints::swapId(posA, posB); std::swap(getId(posA), getId(posB)); } @@ -2148,7 +2219,7 @@ void 
FlatAffineConstraints::setIdToConstant(unsigned pos, int64_t val) { /// Sets the specified identifier to a constant value; asserts if the id is not /// found. -void FlatAffineConstraints::setIdToConstant(Value id, int64_t val) { +void FlatAffineValueConstraints::setIdToConstant(Value id, int64_t val) { unsigned pos; if (!findId(id, &pos)) // This is a pre-condition for this method. @@ -2475,10 +2546,14 @@ void FlatAffineConstraints::print(raw_ostream &os) const { << " constraints)\n"; os << "("; for (unsigned i = 0, e = getNumIds(); i < e; i++) { - if (ids[i] == None) + if (auto *valueCstr = dyn_cast(this)) { + if (valueCstr->hasIdValue(i)) + os << "Value "; + else + os << "None "; + } else { os << "None "; - else - os << "Value "; + } } os << " const)\n"; for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) { @@ -2571,9 +2646,23 @@ void FlatAffineConstraints::removeTrivialRedundancy() { void FlatAffineConstraints::clearAndCopyFrom( const FlatAffineConstraints &other) { - FlatAffineConstraints copy(other); - std::swap(*this, copy); - assert(copy.getNumIds() == copy.getIds().size()); + if (auto *otherValueSet = dyn_cast(&other)) + assert(!otherValueSet->hasIdValues() && + "cannot copy associated Values into FlatAffineConstraints"); + // Note: Assignment operator does not copy the vtable pointer, so kind does not change. 
+ *this = other; +} + +void FlatAffineValueConstraints::clearAndCopyFrom( + const FlatAffineConstraints &other) { + if (auto *otherValueSet = + dyn_cast(&other)) { + *this = *otherValueSet; + } else { + *static_cast(this) = other; + ids.clear(); + ids.resize(numIds, None); + } } void FlatAffineConstraints::removeId(unsigned pos) { @@ -2712,18 +2801,11 @@ void FlatAffineConstraints::fourierMotzkinEliminate( unsigned newNumDims = dimsSymbols.first; unsigned newNumSymbols = dimsSymbols.second; - SmallVector, 8> newIds; - newIds.reserve(numIds - 1); - newIds.append(ids.begin(), ids.begin() + pos); - newIds.append(ids.begin() + pos + 1, ids.end()); - /// Create the new system which has one identifier less. FlatAffineConstraints newFac( lbIndices.size() * ubIndices.size() + nbIndices.size(), getNumEqualities(), getNumCols() - 1, newNumDims, newNumSymbols, - /*numLocals=*/getNumIds() - 1 - newNumDims - newNumSymbols, newIds); - - assert(newFac.getIds().size() == newFac.getNumIds()); + /*numLocals=*/getNumIds() - 1 - newNumDims - newNumSymbols); // This will be used to check if the elimination was integer exact. unsigned lcmProducts = 1; @@ -2813,6 +2895,19 @@ void FlatAffineConstraints::fourierMotzkinEliminate( #undef DEBUG_TYPE #define DEBUG_TYPE "affine-structures" +void FlatAffineValueConstraints::fourierMotzkinEliminate( + unsigned pos, bool darkShadow, bool *isResultIntegerExact) { + SmallVector, 8> newIds; + newIds.reserve(numIds - 1); + newIds.append(ids.begin(), ids.begin() + pos); + newIds.append(ids.begin() + pos + 1, ids.end()); + // Note: Base implementation discards all associated Values. 
+ FlatAffineConstraints::fourierMotzkinEliminate(pos, darkShadow, + isResultIntegerExact); + ids = newIds; + assert(getIds().size() == getNumIds()); +} + void FlatAffineConstraints::projectOut(unsigned pos, unsigned num) { if (num == 0) return; @@ -2848,7 +2943,7 @@ void FlatAffineConstraints::projectOut(unsigned pos, unsigned num) { normalizeConstraintsByGCD(); } -void FlatAffineConstraints::projectOut(Value id) { +void FlatAffineValueConstraints::projectOut(Value id) { unsigned pos; bool ret = findId(id, &pos); assert(ret); @@ -2913,26 +3008,13 @@ static void getCommonConstraints(const FlatAffineConstraints &a, LogicalResult FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { assert(otherCst.getNumDimIds() == numDims && "dims mismatch"); - assert(otherCst.getIds() - .slice(0, getNumDimIds()) - .equals(getIds().slice(0, getNumDimIds())) && - "dim values mismatch"); assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here"); assert(getNumLocalIds() == 0 && "local ids not supported yet here"); - // Align `other` to this. - Optional otherCopy; - if (!areIdsAligned(*this, otherCst)) { - otherCopy.emplace(FlatAffineConstraints(otherCst)); - mergeAndAlignIds(/*offset=*/numDims, this, &otherCopy.getValue()); - } - - const auto &otherAligned = otherCopy ? *otherCopy : otherCst; - // Get the constraints common to both systems; these will be added as is to // the union. FlatAffineConstraints commonCst; - getCommonConstraints(*this, otherAligned, commonCst); + getCommonConstraints(*this, otherCst, commonCst); std::vector> boundingLbs; std::vector> boundingUbs; @@ -2955,7 +3037,7 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { // TODO: handle union if a dimension is unbounded. 
return failure(); - auto otherExtent = otherAligned.getConstantBoundOnDimSize( + auto otherExtent = otherCst.getConstantBoundOnDimSize( d, &otherLb, &otherLbFloorDivisor, &otherUb); if (!otherExtent.hasValue() || lbFloorDivisor != otherLbFloorDivisor) // TODO: symbolic extents when necessary. @@ -2977,7 +3059,7 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { } else { // Uncomparable - check for constant lower/upper bounds. auto constLb = getConstantLowerBound(d); - auto constOtherLb = otherAligned.getConstantLowerBound(d); + auto constOtherLb = otherCst.getConstantLowerBound(d); if (!constLb.hasValue() || !constOtherLb.hasValue()) return failure(); std::fill(minLb.begin(), minLb.end(), 0); @@ -2993,7 +3075,7 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { } else { // Uncomparable - check for constant lower/upper bounds. auto constUb = getConstantUpperBound(d); - auto constOtherUb = otherAligned.getConstantUpperBound(d); + auto constOtherUb = otherCst.getConstantUpperBound(d); if (!constUb.hasValue() || !constOtherUb.hasValue()) return failure(); std::fill(maxUb.begin(), maxUb.end(), 0); @@ -3035,6 +3117,26 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { return success(); } +LogicalResult FlatAffineValueConstraints::unionBoundingBox( + const FlatAffineValueConstraints &otherCst) { + assert(otherCst.getNumDimIds() == numDims && "dims mismatch"); + assert(otherCst.getIds() + .slice(0, getNumDimIds()) + .equals(getIds().slice(0, getNumDimIds())) && + "dim values mismatch"); + assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here"); + assert(getNumLocalIds() == 0 && "local ids not supported yet here"); + + // Align `other` to this. 
+ if (!areIdsAligned(*this, otherCst)) { + FlatAffineValueConstraints otherCopy(otherCst); + mergeAndAlignIds(/*offset=*/numDims, this, &otherCopy); + return FlatAffineConstraints::unionBoundingBox(otherCopy); + } + + return FlatAffineConstraints::unionBoundingBox(otherCst); +} + /// Compute an explicit representation for local vars. For all systems coming /// from MLIR integer sets, maps, or expressions where local vars were /// introduced to model floordivs and mods, this always succeeds. @@ -3068,7 +3170,7 @@ static LogicalResult computeLocalVars(const FlatAffineConstraints &cst, llvm::all_of(localExprs, [](AffineExpr expr) { return expr; })); } -void FlatAffineConstraints::getIneqAsAffineValueMap( +void FlatAffineValueConstraints::getIneqAsAffineValueMap( unsigned pos, unsigned ineqPos, AffineValueMap &vmap, MLIRContext *context) const { unsigned numDims = getNumDimIds(); diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index 262a329ab0de3..328a5f4967664 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -62,10 +62,10 @@ void mlir::getEnclosingAffineForAndIfOps(Operation &op, std::reverse(ops->begin(), ops->end()); } -// Populates 'cst' with FlatAffineConstraints which represent original domain of -// the loop bounds that define 'ivs'. +// Populates 'cst' with FlatAffineValueConstraints which represent original +// domain of the loop bounds that define 'ivs'. LogicalResult -ComputationSliceState::getSourceAsConstraints(FlatAffineConstraints &cst) { +ComputationSliceState::getSourceAsConstraints(FlatAffineValueConstraints &cst) { assert(!ivs.empty() && "Cannot have a slice without its IVs"); cst.reset(/*numDims=*/ivs.size(), /*numSymbols=*/0, /*numLocals=*/0, ivs); for (Value iv : ivs) { @@ -77,9 +77,9 @@ ComputationSliceState::getSourceAsConstraints(FlatAffineConstraints &cst) { return success(); } -// Populates 'cst' with FlatAffineConstraints which represent slice bounds. 
+// Populates 'cst' with FlatAffineValueConstraints which represent slice bounds. LogicalResult -ComputationSliceState::getAsConstraints(FlatAffineConstraints *cst) { +ComputationSliceState::getAsConstraints(FlatAffineValueConstraints *cst) { assert(!lbOperands.empty()); // Adds src 'ivs' as dimension identifiers in 'cst'. unsigned numDims = ivs.size(); @@ -232,7 +232,7 @@ Optional ComputationSliceState::isSliceValid() { return true; // Create constraints for the source loop nest using which slice is computed. - FlatAffineConstraints srcConstraints; + FlatAffineValueConstraints srcConstraints; // TODO: Store the source's domain to avoid computation at each depth. if (failed(getSourceAsConstraints(srcConstraints))) { LLVM_DEBUG(llvm::dbgs() << "Unable to compute source's domain\n"); @@ -254,7 +254,7 @@ Optional ComputationSliceState::isSliceValid() { // Create constraints for the slice loop nest that would be created if the // fusion succeeds. - FlatAffineConstraints sliceConstraints; + FlatAffineValueConstraints sliceConstraints; if (failed(getAsConstraints(&sliceConstraints))) { LLVM_DEBUG(llvm::dbgs() << "Unable to compute slice's domain\n"); return llvm::None; @@ -294,7 +294,7 @@ Optional ComputationSliceState::isMaximal() const { return isMaximalFastCheck; // Create constraints for the src loop nest being sliced. 
- FlatAffineConstraints srcConstraints; + FlatAffineValueConstraints srcConstraints; srcConstraints.reset(/*numDims=*/ivs.size(), /*numSymbols=*/0, /*numLocals=*/0, ivs); for (Value iv : ivs) { @@ -316,7 +316,7 @@ Optional ComputationSliceState::isMaximal() const { for (int i = consumerIVs.size(), end = ivs.size(); i < end; ++i) consumerIVs.push_back(Value()); - FlatAffineConstraints sliceConstraints; + FlatAffineValueConstraints sliceConstraints; sliceConstraints.reset(/*numDims=*/consumerIVs.size(), /*numSymbols=*/0, /*numLocals=*/0, consumerIVs); @@ -760,7 +760,7 @@ static Operation *getInstAtPosition(ArrayRef positions, // Adds loop IV bounds to 'cst' for loop IVs not found in 'ivs'. static LogicalResult addMissingLoopIVBounds(SmallPtrSet &ivs, - FlatAffineConstraints *cst) { + FlatAffineValueConstraints *cst) { for (unsigned i = 0, e = cst->getNumDimIds(); i < e; ++i) { auto value = cst->getIdValue(i); if (ivs.count(value) == 0) { @@ -813,7 +813,7 @@ mlir::computeSliceUnion(ArrayRef opsA, ArrayRef opsB, ComputationSliceState *sliceUnion) { // Compute the union of slice bounds between all pairs in 'opsA' and // 'opsB' in 'sliceUnionCst'. - FlatAffineConstraints sliceUnionCst; + FlatAffineValueConstraints sliceUnionCst; assert(sliceUnionCst.getNumDimAndSymbolIds() == 0); std::vector> dependentOpPairs; for (unsigned i = 0, numOpsA = opsA.size(); i < numOpsA; ++i) { @@ -831,7 +831,7 @@ mlir::computeSliceUnion(ArrayRef opsA, ArrayRef opsB, bool readReadAccesses = isa(srcAccess.opInst) && isa(dstAccess.opInst); - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; // Check dependence between 'srcAccess' and 'dstAccess'. DependenceResult result = checkMemrefAccessDependence( srcAccess, dstAccess, /*loopDepth=*/numCommonLoops + 1, @@ -863,7 +863,7 @@ mlir::computeSliceUnion(ArrayRef opsA, ArrayRef opsB, } // Compute constraints for 'tmpSliceState' in 'tmpSliceCst'. 
- FlatAffineConstraints tmpSliceCst; + FlatAffineValueConstraints tmpSliceCst; if (failed(tmpSliceState.getAsConstraints(&tmpSliceCst))) { LLVM_DEBUG(llvm::dbgs() << "Unable to compute slice bound constraints\n"); @@ -1044,7 +1044,7 @@ const char *const kSliceFusionBarrierAttrName = "slice_fusion_barrier"; // the other loop nest's IVs, symbols and constants (using 'isBackwardsSlice'). void mlir::getComputationSliceState( Operation *depSourceOp, Operation *depSinkOp, - FlatAffineConstraints *dependenceConstraints, unsigned loopDepth, + FlatAffineValueConstraints *dependenceConstraints, unsigned loopDepth, bool isBackwardSlice, ComputationSliceState *sliceState) { // Get loop nest surrounding src operation. SmallVector srcLoopIVs; diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp index b6cce790f715d..e1b635a110a2b 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp @@ -139,7 +139,7 @@ bool hasNoInterveningEffect(Operation *start, T memOp) { // Dependence analysis is only correct if both ops operate on the same // memref. if (srcAccess.memref == destAccess.memref) { - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; // Number of loops containing the start op and the ending operation. 
unsigned minSurroundingLoops = diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 955230d2068f3..7026ad0166369 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -916,7 +916,7 @@ static Value createPrivateMemRef(AffineForOp forOp, Operation *srcStoreOpInst, assert(numElements.hasValue() && "non-constant number of elts in local buffer"); - const FlatAffineConstraints *cst = region.getConstraints(); + const FlatAffineValueConstraints *cst = region.getConstraints(); // 'outerIVs' holds the values that this memory region is symbolic/parametric // on; this would correspond to loop IVs surrounding the level at which the // slice is being materialized. diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp index 29ee0cffe73d2..6b7f9369cbc29 100644 --- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp @@ -235,7 +235,7 @@ static unsigned getMaxLoopDepth(ArrayRef srcOps, unsigned numCommonLoops = getNumCommonSurroundingLoops(*srcOpInst, *dstOpInst); for (unsigned d = 1; d <= numCommonLoops + 1; ++d) { - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; // TODO: Cache dependence analysis results, check cache here. 
DependenceResult result = checkMemrefAccessDependence( srcAccess, dstAccess, d, &dependenceConstraints, diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 15aea84c57dbc..fcca1ae0bd0f9 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -459,7 +459,7 @@ checkTilingLegalityImpl(MutableArrayRef origLoops) { unsigned numOps = loadAndStoreOps.size(); unsigned numLoops = origLoops.size(); - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; for (unsigned d = 1; d <= numLoops + 1; ++d) { for (unsigned i = 0; i < numOps; ++i) { Operation *srcOp = loadAndStoreOps[i]; @@ -596,7 +596,7 @@ void constructTiledLoopNest(MutableArrayRef origLoops, LogicalResult checkIfHyperRectangular(MutableArrayRef input, AffineForOp rootAffineForOp, unsigned width) { - FlatAffineConstraints cst; + FlatAffineValueConstraints cst; SmallVector ops(input.begin(), input.end()); (void)getIndexSet(ops, &cst); if (!cst.isHyperRectangular(0, width)) { @@ -2440,7 +2440,7 @@ static LogicalResult generateCopy( for (unsigned i = 0; i < rank; ++i) region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]); - const FlatAffineConstraints *cst = region.getConstraints(); + const FlatAffineValueConstraints *cst = region.getConstraints(); // 'regionSymbols' hold values that this memory region is symbolic/parametric // on; these typically include loop IVs surrounding the level at which the // copy generation is being done or other valid symbols in MLIR. 
@@ -3001,7 +3001,7 @@ static AffineIfOp createSeparationCondition(MutableArrayRef loops, auto *context = loops[0].getContext(); - FlatAffineConstraints cst; + FlatAffineValueConstraints cst; SmallVector ops; ops.reserve(loops.size()); for (AffineForOp forOp : loops) @@ -3082,7 +3082,7 @@ createFullTiles(MutableArrayRef inputNest, // For each loop in the original nest identify a lower/upper bound pair such // that their difference is a constant. - FlatAffineConstraints cst; + FlatAffineValueConstraints cst; for (auto loop : inputNest) { // TODO: straightforward to generalize to a non-unit stride. if (loop.getStep() != 1) { diff --git a/mlir/test/lib/Analysis/TestMemRefDependenceCheck.cpp b/mlir/test/lib/Analysis/TestMemRefDependenceCheck.cpp index b53092cfc1f47..3be9dc14b7520 100644 --- a/mlir/test/lib/Analysis/TestMemRefDependenceCheck.cpp +++ b/mlir/test/lib/Analysis/TestMemRefDependenceCheck.cpp @@ -81,7 +81,7 @@ static void checkDependences(ArrayRef loadsAndStores) { unsigned numCommonLoops = getNumCommonSurroundingLoops(*srcOpInst, *dstOpInst); for (unsigned d = 1; d <= numCommonLoops + 1; ++d) { - FlatAffineConstraints dependenceConstraints; + FlatAffineValueConstraints dependenceConstraints; SmallVector dependenceComponents; DependenceResult result = checkMemrefAccessDependence( srcAccess, dstAccess, d, &dependenceConstraints, From c411c1bd7f7d3550d24333f80980c0be6481d34a Mon Sep 17 00:00:00 2001 From: Weverything Date: Mon, 16 Aug 2021 16:54:10 -0700 Subject: [PATCH 172/700] Fix missing qualifier in template type diffing Handle SubstTemplateTypeParmType so qualifiers do not get dropped from the diagnostic message. 
--- clang/lib/AST/ASTDiagnostic.cpp | 3 ++ clang/test/Misc/diag-template-diffing.cpp | 37 +++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index dc22481d0a84c..7e435e8b35b80 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -1088,6 +1088,9 @@ class TemplateDiff { Ty->getAs()) return TST; + if (const auto* SubstType = Ty->getAs()) + Ty = SubstType->getReplacementType(); + const RecordType *RT = Ty->getAs(); if (!RT) diff --git a/clang/test/Misc/diag-template-diffing.cpp b/clang/test/Misc/diag-template-diffing.cpp index cc1cc9ca70679..6bf6e2de4277c 100644 --- a/clang/test/Misc/diag-template-diffing.cpp +++ b/clang/test/Misc/diag-template-diffing.cpp @@ -1488,6 +1488,43 @@ void run(A_reg reg, A_ptr ptr, A_ref ref) { } } +namespace SubstTemplateTypeParmType { +template +class Array { +}; + +template +class S{}; + +template +Array Make(T (¶meter)[num]); + +void Run(int, Array>) {} + +Array> Make(); +void Call() { + const S s1[5]; + S s2[5]; + + Run(0, Make(s1)); // Error + Run(0, Make(s2)); // Okay +} + +// CHECK-ELIDE-NOTREE: no matching function for call to 'Run' +// CHECK-ELIDE-NOTREE: no known conversion from 'Array>' to 'Array>' for 2nd argument +// CHECK-NOELIDE-NOTREE: no matching function for call to 'Run' +// CHECK-NOELIDE-NOTREE: no known conversion from 'Array>' to 'Array>' for 2nd argument +// CHECK-ELIDE-TREE: no matching function for call to 'Run' +// CHECK-ELIDE-TREE: no known conversion from argument type to parameter type for 2nd argument +// CHECK-ELIDE-TREE: Array< +// CHECK-ELIDE-TREE: [const != (no qualifiers)] S<...>> +// CHECK-NOELIDE-TREE: no matching function for call to 'Run' +// CHECK-NOELIDE-TREE: no known conversion from argument type to parameter type for 2nd argument +// CHECK-NOELIDE-TREE: Array< +// CHECK-NOELIDE-TREE: [const != (no qualifiers)] S< +// CHECK-NOELIDE-TREE: int>> +} + // CHECK-ELIDE-NOTREE: {{[0-9]*}} 
errors generated. // CHECK-NOELIDE-NOTREE: {{[0-9]*}} errors generated. // CHECK-ELIDE-TREE: {{[0-9]*}} errors generated. From c19c51e357a2e15e391e547441291f8a2ff771f9 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 17 Aug 2021 10:27:41 +0900 Subject: [PATCH 173/700] [mlir][Analysis][NFC] Clean up FlatAffineValueConstraints * Rename ids to values in FlatAffineValueConstraints. * Overall cleanup of comments in FlatAffineConstraints and FlatAffineValueConstraints. Differential Revision: https://reviews.llvm.org/D107947 --- mlir/include/mlir/Analysis/AffineStructures.h | 241 +++++++++--------- mlir/lib/Analysis/AffineAnalysis.cpp | 46 ++-- mlir/lib/Analysis/AffineStructures.cpp | 190 +++++++------- mlir/lib/Analysis/Utils.cpp | 22 +- .../Dialect/Linalg/Transforms/Hoisting.cpp | 2 +- mlir/lib/Transforms/LoopFusion.cpp | 2 +- mlir/lib/Transforms/Utils/LoopUtils.cpp | 8 +- 7 files changed, 248 insertions(+), 263 deletions(-) diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h index c97a2f6493eed..76f62355d9af8 100644 --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -116,7 +116,7 @@ class FlatAffineConstraints { void reset(unsigned numDims = 0, unsigned numSymbols = 0, unsigned numLocals = 0); - /// Appends constraints from 'other' into this. This is equivalent to an + /// Appends constraints from `other` into `this`. This is equivalent to an /// intersection with no simplification of any sort attempted. void append(const FlatAffineConstraints &other); @@ -127,14 +127,15 @@ class FlatAffineConstraints { /// Returns false otherwise. bool isEmpty() const; - /// Runs the GCD test on all equality constraints. Returns 'true' if this test - /// fails on any equality. Returns 'false' otherwise. + /// Runs the GCD test on all equality constraints. Returns true if this test + /// fails on any equality. Returns false otherwise. 
/// This test can be used to disprove the existence of a solution. If it /// returns true, no integer solution to the equality constraints can exist. bool isEmptyByGCDTest() const; /// Returns true if the set of constraints is found to have no solution, - /// false if a solution exists. Uses the same algorithm as findIntegerSample. + /// false if a solution exists. Uses the same algorithm as + /// `findIntegerSample`. bool isIntegerEmpty() const; /// Returns a matrix where each row is a vector along which the polytope is @@ -162,10 +163,10 @@ class FlatAffineConstraints { inline int64_t atEq(unsigned i, unsigned j) const { return equalities(i, j); } inline int64_t &atEq(unsigned i, unsigned j) { return equalities(i, j); } + /// Returns the value at the specified inequality row and column. inline int64_t atIneq(unsigned i, unsigned j) const { return inequalities(i, j); } - inline int64_t &atIneq(unsigned i, unsigned j) { return inequalities(i, j); } /// Returns the number of columns in the constraint system. @@ -208,8 +209,8 @@ class FlatAffineConstraints { /// being known and such a local variable appearing in any of the constraints. IntegerSet getAsIntegerSet(MLIRContext *context) const; - /// Computes the lower and upper bounds of the first 'num' dimensional - /// identifiers (starting at 'offset') as an affine map of the remaining + /// Computes the lower and upper bounds of the first `num` dimensional + /// identifiers (starting at `offset`) as an affine map of the remaining /// identifiers (dimensional and symbolic). This method is able to detect /// identifiers as floordiv's and mod's of affine expressions of other /// identifiers with respect to (positive) constants. Sets bound map to a @@ -218,9 +219,9 @@ class FlatAffineConstraints { SmallVectorImpl *lbMaps, SmallVectorImpl *ubMaps); - /// Adds an inequality (>= 0) from the coefficients specified in inEq. + /// Adds an inequality (>= 0) from the coefficients specified in `inEq`. 
void addInequality(ArrayRef inEq); - /// Adds an equality from the coefficients specified in eq. + /// Adds an equality from the coefficients specified in `eq`. void addEquality(ArrayRef eq); /// Adds a constant lower bound constraint for the specified identifier. @@ -229,8 +230,8 @@ class FlatAffineConstraints { void addConstantUpperBound(unsigned pos, int64_t ub); /// Adds a new local identifier as the floordiv of an affine function of other - /// identifiers, the coefficients of which are provided in 'dividend' and with - /// respect to a positive constant 'divisor'. Two constraints are added to the + /// identifiers, the coefficients of which are provided in `dividend` and with + /// respect to a positive constant `divisor`. Two constraints are added to the /// system to capture equivalence with the floordiv: /// q = dividend floordiv c <=> c*q <= dividend <= c*q + c - 1. void addLocalFloorDiv(ArrayRef dividend, int64_t divisor); @@ -260,8 +261,8 @@ class FlatAffineConstraints { /// system. Returns failure if `other` is a semi-affine map. LogicalResult composeMatchingMap(AffineMap other); - /// Projects out (aka eliminates) 'num' identifiers starting at position - /// 'pos'. The resulting constraint system is the shadow along the dimensions + /// Projects out (aka eliminates) `num` identifiers starting at position + /// `pos`. The resulting constraint system is the shadow along the dimensions /// that still exist. This method may not always be integer exact. // TODO: deal with integer exactness when necessary - can return a value to // mark exactness for example. @@ -274,8 +275,8 @@ class FlatAffineConstraints { void removeEquality(unsigned pos); void removeInequality(unsigned pos); - /// Sets the values.size() identifiers starting at pos to the specified values - /// and removes them. + /// Sets the `values.size()` identifiers starting at `pos` to the specified + /// values and removes them. 
void setAndEliminate(unsigned pos, ArrayRef values); /// Changes the partition between dimensions and symbols. Depending on the new @@ -289,23 +290,27 @@ class FlatAffineConstraints { /// system. LogicalResult constantFoldId(unsigned pos); - /// This method calls constantFoldId for the specified range of identifiers, - /// 'num' identifiers starting at position 'pos'. + /// This method calls `constantFoldId` for the specified range of identifiers, + /// `num` identifiers starting at position `pos`. void constantFoldIdRange(unsigned pos, unsigned num); /// Updates the constraints to be the smallest bounding (enclosing) box that - /// contains the points of 'this' set and that of 'other', with the symbols + /// contains the points of `this` set and that of `other`, with the symbols /// being treated specially. For each of the dimensions, the min of the lower /// bounds (symbolic) and the max of the upper bounds (symbolic) is computed - /// to determine such a bounding box. `other' is expected to have the same + /// to determine such a bounding box. `other` is expected to have the same /// dimensional identifiers as this constraint system (in the same order). /// - /// Eg: if 'this' is {0 <= d0 <= 127}, 'other' is {16 <= d0 <= 192}, the - /// output is {0 <= d0 <= 192}. - /// 2) 'this' = {s0 + 5 <= d0 <= s0 + 20}, 'other' is {s0 + 1 <= d0 <= s0 + - /// 9}, output = {s0 + 1 <= d0 <= s0 + 20}. - /// 3) 'this' = {0 <= d0 <= 5, 1 <= d1 <= 9}, 'other' = {2 <= d0 <= 6, 5 <= d1 - /// <= 15}, output = {0 <= d0 <= 6, 1 <= d1 <= 15}. 
+ /// E.g.: + /// 1) this = {0 <= d0 <= 127}, + /// other = {16 <= d0 <= 192}, + /// output = {0 <= d0 <= 192} + /// 2) this = {s0 + 5 <= d0 <= s0 + 20}, + /// other = {s0 + 1 <= d0 <= s0 + 9}, + /// output = {s0 + 1 <= d0 <= s0 + 20} + /// 3) this = {0 <= d0 <= 5, 1 <= d1 <= 9} + /// other = {2 <= d0 <= 6, 5 <= d1 <= 15}, + /// output = {0 <= d0 <= 6, 1 <= d1 <= 15} LogicalResult unionBoundingBox(const FlatAffineConstraints &other); unsigned getNumConstraints() const { @@ -332,7 +337,7 @@ class FlatAffineConstraints { /// involving only the symbolic identifiers. `lb` and `ub` (along with the /// `boundFloorDivisor`) are set to represent the lower and upper bound /// associated with the constant difference: `lb`, `ub` have the coefficients, - /// and boundFloorDivisor, their divisor. `minLbPos` and `minUbPos` if + /// and `boundFloorDivisor`, their divisor. `minLbPos` and `minUbPos` if /// non-null are set to the position of the constant lower bound and upper /// bound respectively (to the same if they are from an equality). Ex: if the /// lower bound is [(s0 + s2 - 1) floordiv 32] for a system with three @@ -375,7 +380,7 @@ class FlatAffineConstraints { unsigned offset = 0, unsigned num = 0) const; /// Removes constraints that are independent of (i.e., do not have a - /// coefficient for) for identifiers in the range [pos, pos + num). + /// coefficient for) identifiers in the range [pos, pos + num). void removeIndependentConstraints(unsigned pos, unsigned num); /// Returns true if the set can be trivially detected as being @@ -390,8 +395,8 @@ class FlatAffineConstraints { /// constraints by their GCD and performs GCD tightening on inequalities. void removeTrivialRedundancy(); - /// A more expensive check to detect redundant inequalities thatn - /// removeTrivialRedundancy. + /// A more expensive check than `removeTrivialRedundancy` to detect redundant + /// inequalities. void removeRedundantInequalities(); /// Removes redundant constraints using Simplex.
Although the algorithm can @@ -415,8 +420,8 @@ class FlatAffineConstraints { /// Checks all rows of equality/inequality constraints for trivial /// contradictions (for example: 1 == 0, 0 >= 1), which may have surfaced - /// after elimination. Returns 'true' if an invalid constraint is found; - /// 'false'otherwise. + /// after elimination. Returns true if an invalid constraint is found; + /// false otherwise. bool hasInvalidConstraint() const; /// Returns the constant lower bound bound if isLower is true, and the upper @@ -437,9 +442,9 @@ class FlatAffineConstraints { LogicalResult flattenAlignedMapAndMergeLocals( AffineMap map, std::vector> *flattenedExprs); - /// Eliminates a single identifier at 'position' from equality and inequality - /// constraints. Returns 'success' if the identifier was eliminated, and - /// 'failure' otherwise. + /// Eliminates a single identifier at `position` from equality and inequality + /// constraints. Returns `success` if the identifier was eliminated, and + /// `failure` otherwise. inline LogicalResult gaussianEliminateId(unsigned position) { return success(gaussianEliminateIds(position, position + 1) == 1); } @@ -449,10 +454,10 @@ class FlatAffineConstraints { /// Returns the number of variables eliminated. unsigned gaussianEliminateIds(unsigned posStart, unsigned posLimit); - /// Eliminates identifier at the specified position using Fourier-Motzkin + /// Eliminates the identifier at the specified position using Fourier-Motzkin /// variable elimination, but uses Gaussian elimination if there is an /// equality involving that identifier. If the result of the elimination is - /// integer exact, *isResultIntegerExact is set to true. If 'darkShadow' is + /// integer exact, `*isResultIntegerExact` is set to true. If `darkShadow` is /// set to true, a potential under approximation (subset) of the rational /// shadow / exact integer shadow is computed. // See implementation comments for more details. 
@@ -514,28 +519,28 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { unsigned numReservedEqualities, unsigned numReservedCols, unsigned numDims, unsigned numSymbols, unsigned numLocals, - ArrayRef> idArgs = {}) + ArrayRef> valArgs = {}) : FlatAffineConstraints(numReservedInequalities, numReservedEqualities, numReservedCols, numDims, numSymbols, numLocals) { assert(numReservedCols >= numIds + 1); - assert(idArgs.empty() || idArgs.size() == numIds); - ids.reserve(numReservedCols); - if (idArgs.empty()) - ids.resize(numIds, None); + assert(valArgs.empty() || valArgs.size() == numIds); + values.reserve(numReservedCols); + if (valArgs.empty()) + values.resize(numIds, None); else - ids.append(idArgs.begin(), idArgs.end()); + values.append(valArgs.begin(), valArgs.end()); } /// Constructs a constraint system with the specified number of /// dimensions and symbols. FlatAffineValueConstraints(unsigned numDims = 0, unsigned numSymbols = 0, unsigned numLocals = 0, - ArrayRef> idArgs = {}) + ArrayRef> valArgs = {}) : FlatAffineValueConstraints(/*numReservedInequalities=*/0, /*numReservedEqualities=*/0, /*numReservedCols=*/numDims + numSymbols + numLocals + 1, - numDims, numSymbols, numLocals, idArgs) {} + numDims, numSymbols, numLocals, valArgs) {} /// Create a flat affine constraint system from an AffineValueMap or a list of /// these. The constructed system will only include equalities. @@ -562,9 +567,9 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { unsigned numLocals = 0) override; void reset(unsigned numReservedInequalities, unsigned numReservedEqualities, unsigned numReservedCols, unsigned numDims, unsigned numSymbols, - unsigned numLocals, ArrayRef idArgs); + unsigned numLocals, ArrayRef valArgs); void reset(unsigned numDims, unsigned numSymbols, unsigned numLocals, - ArrayRef idArgs); + ArrayRef valArgs); using FlatAffineConstraints::reset; /// Clones this object. 
@@ -572,7 +577,7 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { /// Adds constraints (lower and upper bounds) for the specified 'affine.for' /// operation's Value using IR information stored in its bound maps. The - /// right identifier is first looked up using forOp's Value. Asserts if the + /// right identifier is first looked up using `forOp`'s Value. Asserts if the /// Value corresponding to the 'affine.for' operation isn't found in the /// constraint system. Returns failure for the yet unimplemented/unsupported /// cases. Any new identifiers that are found in the bound operands of the @@ -583,9 +588,9 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { LogicalResult addAffineForOpDomain(AffineForOp forOp); /// Adds constraints (lower and upper bounds) for each loop in the loop nest - /// described by the bound maps 'lbMaps' and 'ubMaps' of a computation slice. - /// Every pair ('lbMaps[i]', 'ubMaps[i]') describes the bounds of a loop in - /// the nest, sorted outer-to-inner. 'operands' contains the bound operands + /// described by the bound maps `lbMaps` and `ubMaps` of a computation slice. + /// Every pair (`lbMaps[i]`, `ubMaps[i]`) describes the bounds of a loop in + /// the nest, sorted outer-to-inner. `operands` contains the bound operands /// for a single bound map. All the bound maps will use the same bound /// operands. Note that some loops described by a computation slice might not /// exist yet in the IR so the Value attached to those dimension identifiers @@ -624,52 +629,52 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { AffineValueMap &vmap, MLIRContext *context) const; - /// Adds slice lower bounds represented by lower bounds in 'lbMaps' and upper - /// bounds in 'ubMaps' to each identifier in the constraint system which has - /// a value in 'values'. Note that both lower/upper bounds share the same - /// operand list 'operands'. 
- /// This function assumes 'values.size' == 'lbMaps.size' == 'ubMaps.size'. - /// Note that both lower/upper bounds use operands from 'operands'. + /// Adds slice lower bounds represented by lower bounds in `lbMaps` and upper + /// bounds in `ubMaps` to each identifier in the constraint system which has + /// a value in `values`. Note that both lower/upper bounds share the same + /// operand list `operands`. + /// This function assumes `values.size` == `lbMaps.size` == `ubMaps.size`. + /// Note that both lower/upper bounds use operands from `operands`. LogicalResult addSliceBounds(ArrayRef values, ArrayRef lbMaps, ArrayRef ubMaps, ArrayRef operands); - /// Sets the identifier corresponding to the specified Value id to a - /// constant. Asserts if the 'id' is not found. - void setIdToConstant(Value id, int64_t val); + /// Sets the identifier corresponding to the specified Value `value` to a + /// constant. Asserts if the `value` is not found. + void setIdToConstant(Value value, int64_t val); using FlatAffineConstraints::setIdToConstant; /// Looks up the position of the identifier with the specified Value. Returns - /// true if found (false otherwise). `pos' is set to the (column) position of + /// true if found (false otherwise). `pos` is set to the (column) position of /// the identifier. - bool findId(Value id, unsigned *pos) const; + bool findId(Value val, unsigned *pos) const; /// Returns true if an identifier with the specified Value exists, false /// otherwise. - bool containsId(Value id) const; + bool containsId(Value val) const; /// Swap the posA^th identifier with the posB^th identifier. void swapId(unsigned posA, unsigned posB) override; /// Add identifiers of the specified kind - specified positions are relative /// to the kind of identifier. The coefficient column corresponding to the - /// added identifier is initialized to zero. 'id' is the Value corresponding + /// added identifier is initialized to zero. 
`val` is the Value corresponding /// to the identifier that can optionally be provided. - void addDimId(unsigned pos, Value id); + void addDimId(unsigned pos, Value val); using FlatAffineConstraints::addDimId; - void addSymbolId(unsigned pos, Value id); + void addSymbolId(unsigned pos, Value val); using FlatAffineConstraints::addSymbolId; unsigned addId(IdKind kind, unsigned pos) override; - unsigned addId(IdKind kind, unsigned pos, Value id); + unsigned addId(IdKind kind, unsigned pos, Value val); /// Add the specified values as a dim or symbol id depending on its nature, if - /// it already doesn't exist in the system. `id' has to be either a terminal + /// it already doesn't exist in the system. `val` has to be either a terminal /// symbol or a loop IV, i.e., it cannot be the result affine.apply of any /// symbols or loop IVs. The identifier is added to the end of the existing /// dims or symbols. Additional information on the identifier is extracted /// from the IR and added to the constraint system. - void addInductionVarOrTerminalSymbol(Value id); + void addInductionVarOrTerminalSymbol(Value val); /// Align `map` with this constraint system based on `operands`. Each operand /// must already have a corresponding dim/symbol in this constraint system. @@ -688,104 +693,100 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { LogicalResult composeMap(const AffineValueMap *vMap); /// Projects out the identifier that is associate with Value. - void projectOut(Value id); + void projectOut(Value val); using FlatAffineConstraints::projectOut; /// Changes all symbol identifiers which are loop IVs to dim identifiers. void convertLoopIVSymbolsToDims(); /// Updates the constraints to be the smallest bounding (enclosing) box that - /// contains the points of 'this' set and that of 'other', with the symbols + /// contains the points of `this` set and that of `other`, with the symbols /// being treated specially. 
For each of the dimensions, the min of the lower /// bounds (symbolic) and the max of the upper bounds (symbolic) is computed - /// to determine such a bounding box. `other' is expected to have the same + /// to determine such a bounding box. `other` is expected to have the same /// dimensional identifiers as this constraint system (in the same order). /// - /// Eg: if 'this' is {0 <= d0 <= 127}, 'other' is {16 <= d0 <= 192}, the - /// output is {0 <= d0 <= 192}. - /// 2) 'this' = {s0 + 5 <= d0 <= s0 + 20}, 'other' is {s0 + 1 <= d0 <= s0 + - /// 9}, output = {s0 + 1 <= d0 <= s0 + 20}. - /// 3) 'this' = {0 <= d0 <= 5, 1 <= d1 <= 9}, 'other' = {2 <= d0 <= 6, 5 <= d1 - /// <= 15}, output = {0 <= d0 <= 6, 1 <= d1 <= 15}. + /// E.g.: + /// 1) this = {0 <= d0 <= 127}, + /// other = {16 <= d0 <= 192}, + /// output = {0 <= d0 <= 192} + /// 2) this = {s0 + 5 <= d0 <= s0 + 20}, + /// other = {s0 + 1 <= d0 <= s0 + 9}, + /// output = {s0 + 1 <= d0 <= s0 + 20} + /// 3) this = {0 <= d0 <= 5, 1 <= d1 <= 9} + /// other = {2 <= d0 <= 6, 5 <= d1 <= 15}, + /// output = {0 <= d0 <= 6, 1 <= d1 <= 15} LogicalResult unionBoundingBox(const FlatAffineValueConstraints &other); using FlatAffineConstraints::unionBoundingBox; - /// Merge and align the identifiers of 'this' and 'other' starting at - /// 'offset', so that both constraint systems get the union of the contained + /// Merge and align the identifiers of `this` and `other` starting at + /// `offset`, so that both constraint systems get the union of the contained /// identifiers that is dimension-wise and symbol-wise unique; both /// constraint systems are updated so that they have the union of all - /// identifiers, with this's original identifiers appearing first followed by - /// any of other's identifiers that didn't appear in 'this'. Local + /// identifiers, with `this`'s original identifiers appearing first followed + /// by any of `other`'s identifiers that didn't appear in `this`. 
Local /// identifiers of each system are by design separate/local and are placed - /// one after other (this's followed by other's). - // Eg: Input: 'this' has ((%i %j) [%M %N]) - // 'other' has (%k, %j) [%P, %N, %M]) - // Output: both 'this', 'other' have (%i, %j, %k) [%M, %N, %P] + /// one after other (`this`'s followed by `other`'s). + // Eg: Input: `this` has (%i %j) [%M %N] + // `other` has (%k, %j) [%P, %N, %M] + // Output: both `this`, `other` have (%i, %j, %k) [%M, %N, %P] // void mergeAndAlignIdsWithOther(unsigned offset, FlatAffineValueConstraints *other); - /// Returns 'true' if this constraint system and 'other' are in the same + /// Returns true if this constraint system and `other` are in the same /// space, i.e., if they are associated with the same set of identifiers, - /// appearing in the same order. Returns 'false' otherwise. + /// appearing in the same order. Returns false otherwise. bool areIdsAlignedWithOther(const FlatAffineValueConstraints &other); /// Replaces the contents of this FlatAffineValueConstraints with `other`. void clearAndCopyFrom(const FlatAffineConstraints &other) override; - inline ArrayRef> getIds() const { - return {ids.data(), ids.size()}; - } - inline MutableArrayRef> getIds() { - return {ids.data(), ids.size()}; - } - - /// Returns the optional Value corresponding to the pos^th identifier. - inline Optional getId(unsigned pos) const { return ids[pos]; } - inline Optional &getId(unsigned pos) { return ids[pos]; } - /// Returns the Value associated with the pos^th identifier. Asserts if /// no Value identifier was associated. - inline Value getIdValue(unsigned pos) const { - assert(hasIdValue(pos) && "identifier's Value not set"); - return ids[pos].getValue(); + inline Value getValue(unsigned pos) const { + assert(hasValue(pos) && "identifier's Value not set"); + return values[pos].getValue(); } /// Returns true if the pos^th identifier has an associated Value. 
- inline bool hasIdValue(unsigned pos) const { return ids[pos].hasValue(); } + inline bool hasValue(unsigned pos) const { return values[pos].hasValue(); } /// Returns true if at least one identifier has an associated Value. - bool hasIdValues() const; + bool hasValues() const; /// Returns the Values associated with identifiers in range [start, end). /// Asserts if no Value was associated with one of these identifiers. - void getIdValues(unsigned start, unsigned end, - SmallVectorImpl *values) const { + inline void getValues(unsigned start, unsigned end, + SmallVectorImpl *values) const { assert((start < numIds || start == end) && "invalid start position"); assert(end <= numIds && "invalid end position"); values->clear(); values->reserve(end - start); - for (unsigned i = start; i < end; i++) { - values->push_back(getIdValue(i)); - } + for (unsigned i = start; i < end; i++) + values->push_back(getValue(i)); + } + inline void getAllValues(SmallVectorImpl *values) const { + getValues(0, numIds, values); } - inline void getAllIdValues(SmallVectorImpl *values) const { - getIdValues(0, numIds, values); + + inline ArrayRef> getMaybeValues() const { + return {values.data(), values.size()}; } - /// Sets Value associated with the pos^th identifier. - inline void setIdValue(unsigned pos, Value val) { + /// Sets the Value associated with the pos^th identifier. + inline void setValue(unsigned pos, Value val) { assert(pos < numIds && "invalid id position"); - ids[pos] = val; + values[pos] = val; } - /// Sets Values associated with identifiers in the range [start, end). - void setIdValues(unsigned start, unsigned end, ArrayRef values) { + /// Sets the Values associated with the identifiers in the range [start, end). 
+ void setValues(unsigned start, unsigned end, ArrayRef values) { assert((start < numIds || end == start) && "invalid start position"); assert(end <= numIds && "invalid end position"); assert(values.size() == end - start); for (unsigned i = start; i < end; ++i) - ids[i] = values[i - start]; + setValue(i, values[i - start]); } protected: @@ -799,10 +800,10 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { /// arrays as needed. void removeIdRange(unsigned idStart, unsigned idLimit) override; - /// Eliminates identifier at the specified position using Fourier-Motzkin + /// Eliminates the identifier at the specified position using Fourier-Motzkin /// variable elimination, but uses Gaussian elimination if there is an /// equality involving that identifier. If the result of the elimination is - /// integer exact, *isResultIntegerExact is set to true. If 'darkShadow' is + /// integer exact, `*isResultIntegerExact` is set to true. If `darkShadow` is /// set to true, a potential under approximation (subset) of the rational /// shadow / exact integer shadow is computed. // See implementation comments for more details. @@ -811,9 +812,9 @@ class FlatAffineValueConstraints : public FlatAffineConstraints { /// Values corresponding to the (column) identifiers of this constraint /// system appearing in the order the identifiers correspond to columns. - /// Temporary ones or those that aren't associated to any Value are set to + /// Temporary ones or those that aren't associated with any Value are set to /// None. 
- SmallVector, 8> ids; + SmallVector, 8> values; }; /// Flattens 'expr' into 'flattenedExpr', which contains the coefficients of the diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp index 293bcf8c0dd94..e3feb780bdf46 100644 --- a/mlir/lib/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Analysis/AffineAnalysis.cpp @@ -413,12 +413,12 @@ static void buildDimAndSymbolPositionMaps( // the collected values into dim and symbol parts. SmallVector srcDimValues, dstDimValues, srcSymbolValues, dstSymbolValues; - srcDomain.getIdValues(0, srcDomain.getNumDimIds(), &srcDimValues); - dstDomain.getIdValues(0, dstDomain.getNumDimIds(), &dstDimValues); - srcDomain.getIdValues(srcDomain.getNumDimIds(), - srcDomain.getNumDimAndSymbolIds(), &srcSymbolValues); - dstDomain.getIdValues(dstDomain.getNumDimIds(), - dstDomain.getNumDimAndSymbolIds(), &dstSymbolValues); + srcDomain.getValues(0, srcDomain.getNumDimIds(), &srcDimValues); + dstDomain.getValues(0, dstDomain.getNumDimIds(), &dstDimValues); + srcDomain.getValues(srcDomain.getNumDimIds(), + srcDomain.getNumDimAndSymbolIds(), &srcSymbolValues); + dstDomain.getValues(dstDomain.getNumDimIds(), + dstDomain.getNumDimAndSymbolIds(), &dstSymbolValues); // Update value position map with dim values from src iteration domain. updateValuePosMap(srcDimValues, /*isSrc=*/true, /*isDim=*/TRUE); @@ -464,11 +464,11 @@ initDependenceConstraints(const FlatAffineValueConstraints &srcDomain, // Set values corresponding to dependence constraint identifiers. 
SmallVector srcLoopIVs, dstLoopIVs; - srcDomain.getIdValues(0, srcDomain.getNumDimIds(), &srcLoopIVs); - dstDomain.getIdValues(0, dstDomain.getNumDimIds(), &dstLoopIVs); + srcDomain.getValues(0, srcDomain.getNumDimIds(), &srcLoopIVs); + dstDomain.getValues(0, dstDomain.getNumDimIds(), &dstLoopIVs); - dependenceConstraints->setIdValues(0, srcLoopIVs.size(), srcLoopIVs); - dependenceConstraints->setIdValues( + dependenceConstraints->setValues(0, srcLoopIVs.size(), srcLoopIVs); + dependenceConstraints->setValues( srcLoopIVs.size(), srcLoopIVs.size() + dstLoopIVs.size(), dstLoopIVs); // Set values for the symbolic identifier dimensions. `isSymbolDetermined` @@ -481,7 +481,7 @@ initDependenceConstraints(const FlatAffineValueConstraints &srcDomain, for (auto value : values) { if (isSymbolDetermined || !isForInductionVar(value)) { assert(isValidSymbol(value) && "expected symbol"); - dependenceConstraints->setIdValue(valuePosMap.getSymPos(value), value); + dependenceConstraints->setValue(valuePosMap.getSymPos(value), value); } } }; @@ -492,10 +492,10 @@ initDependenceConstraints(const FlatAffineValueConstraints &srcDomain, setSymbolIds(dstAccessMap.getOperands(), /*isSymbolDetermined=*/false); SmallVector srcSymbolValues, dstSymbolValues; - srcDomain.getIdValues(srcDomain.getNumDimIds(), - srcDomain.getNumDimAndSymbolIds(), &srcSymbolValues); - dstDomain.getIdValues(dstDomain.getNumDimIds(), - dstDomain.getNumDimAndSymbolIds(), &dstSymbolValues); + srcDomain.getValues(srcDomain.getNumDimIds(), + srcDomain.getNumDimAndSymbolIds(), &srcSymbolValues); + dstDomain.getValues(dstDomain.getNumDimIds(), + dstDomain.getNumDimAndSymbolIds(), &dstSymbolValues); // Since we only take symbol Values out of `srcDomain` and `dstDomain`, // `isSymbolDetermined` is kept to its default value: true. 
setSymbolIds(srcSymbolValues); @@ -503,7 +503,7 @@ initDependenceConstraints(const FlatAffineValueConstraints &srcDomain, for (unsigned i = 0, e = dependenceConstraints->getNumDimAndSymbolIds(); i < e; i++) - assert(dependenceConstraints->getIds()[i].hasValue()); + assert(dependenceConstraints->hasValue(i)); } // Adds iteration domain constraints from 'srcDomain' and 'dstDomain' into @@ -527,8 +527,8 @@ static void addDomainConstraints(const FlatAffineValueConstraints &srcDomain, return isEq ? domain.atEq(i, j) : domain.atIneq(i, j); }; auto map = [&](unsigned i) -> int64_t { - return isSrc ? valuePosMap.getSrcDimOrSymPos(domain.getIdValue(i)) - : valuePosMap.getDstDimOrSymPos(domain.getIdValue(i)); + return isSrc ? valuePosMap.getSrcDimOrSymPos(domain.getValue(i)) + : valuePosMap.getDstDimOrSymPos(domain.getValue(i)); }; for (unsigned i = 0; i < numCsts; ++i) { @@ -727,12 +727,12 @@ getNumCommonLoops(const FlatAffineValueConstraints &srcDomain, std::min(srcDomain.getNumDimIds(), dstDomain.getNumDimIds()); unsigned numCommonLoops = 0; for (unsigned i = 0; i < minNumLoops; ++i) { - if (!isForInductionVar(srcDomain.getIdValue(i)) || - !isForInductionVar(dstDomain.getIdValue(i)) || - srcDomain.getIdValue(i) != dstDomain.getIdValue(i)) + if (!isForInductionVar(srcDomain.getValue(i)) || + !isForInductionVar(dstDomain.getValue(i)) || + srcDomain.getValue(i) != dstDomain.getValue(i)) break; if (commonLoops != nullptr) - commonLoops->push_back(getForInductionVarOwner(srcDomain.getIdValue(i))); + commonLoops->push_back(getForInductionVarOwner(srcDomain.getValue(i))); ++numCommonLoops; } if (commonLoops != nullptr) @@ -768,7 +768,7 @@ static Block *getCommonBlock(const MemRefAccess &srcAccess, } return block; } - Value commonForIV = srcDomain.getIdValue(numCommonLoops - 1); + Value commonForIV = srcDomain.getValue(numCommonLoops - 1); AffineForOp forOp = getForInductionVarOwner(commonForIV); assert(forOp && "commonForValue was not an induction variable"); diff --git 
a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index d36893bca61c6..bb636299358e3 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -188,7 +188,7 @@ FlatAffineConstraints::FlatAffineConstraints(IntegerSet set) // Construct from an IntegerSet. FlatAffineValueConstraints::FlatAffineValueConstraints(IntegerSet set) : FlatAffineConstraints(set) { - ids.resize(numIds, None); + values.resize(numIds, None); } void FlatAffineConstraints::reset(unsigned numReservedInequalities, @@ -210,22 +210,22 @@ void FlatAffineValueConstraints::reset(unsigned numReservedInequalities, unsigned newNumSymbols, unsigned newNumLocals) { reset(numReservedInequalities, numReservedEqualities, newNumReservedCols, - newNumDims, newNumSymbols, newNumLocals, /*idArgs=*/{}); + newNumDims, newNumSymbols, newNumLocals, /*valArgs=*/{}); } void FlatAffineValueConstraints::reset( unsigned numReservedInequalities, unsigned numReservedEqualities, unsigned newNumReservedCols, unsigned newNumDims, unsigned newNumSymbols, - unsigned newNumLocals, ArrayRef idArgs) { + unsigned newNumLocals, ArrayRef valArgs) { assert(newNumReservedCols >= newNumDims + newNumSymbols + newNumLocals + 1 && "minimum 1 column"); - SmallVector, 8> newIds; - if (!idArgs.empty()) - newIds.assign(idArgs.begin(), idArgs.end()); + SmallVector, 8> newVals; + if (!valArgs.empty()) + newVals.assign(valArgs.begin(), valArgs.end()); *this = FlatAffineValueConstraints( numReservedInequalities, numReservedEqualities, newNumReservedCols, - newNumDims, newNumSymbols, newNumLocals, newIds); + newNumDims, newNumSymbols, newNumLocals, newVals); } void FlatAffineConstraints::reset(unsigned newNumDims, unsigned newNumSymbols, @@ -237,9 +237,9 @@ void FlatAffineConstraints::reset(unsigned newNumDims, unsigned newNumSymbols, void FlatAffineValueConstraints::reset(unsigned newNumDims, unsigned newNumSymbols, unsigned newNumLocals, - ArrayRef idArgs) { + ArrayRef valArgs) 
{ reset(0, 0, newNumDims + newNumSymbols + newNumLocals + 1, newNumDims, - newNumSymbols, newNumLocals, idArgs); + newNumSymbols, newNumLocals, valArgs); } void FlatAffineConstraints::append(const FlatAffineConstraints &other) { @@ -267,16 +267,16 @@ void FlatAffineConstraints::addDimId(unsigned pos) { addId(IdKind::Dimension, pos); } -void FlatAffineValueConstraints::addDimId(unsigned pos, Value id) { - addId(IdKind::Dimension, pos, id); +void FlatAffineValueConstraints::addDimId(unsigned pos, Value val) { + addId(IdKind::Dimension, pos, val); } void FlatAffineConstraints::addSymbolId(unsigned pos) { addId(IdKind::Symbol, pos); } -void FlatAffineValueConstraints::addSymbolId(unsigned pos, Value id) { - addId(IdKind::Symbol, pos, id); +void FlatAffineValueConstraints::addSymbolId(unsigned pos, Value val) { + addId(IdKind::Symbol, pos, val); } unsigned FlatAffineConstraints::addId(IdKind kind, unsigned pos) { @@ -306,26 +306,27 @@ unsigned FlatAffineConstraints::addId(IdKind kind, unsigned pos) { } unsigned FlatAffineValueConstraints::addId(IdKind kind, unsigned pos) { - return addId(kind, pos, /*id=*/{}); + return addId(kind, pos, /*val=*/{}); } unsigned FlatAffineValueConstraints::addId(IdKind kind, unsigned pos, - Value id) { + Value val) { unsigned absolutePos = FlatAffineConstraints::addId(kind, pos); // If an 'id' is provided, insert it; otherwise use None. 
- if (id) - ids.insert(ids.begin() + absolutePos, id); + if (val) + values.insert(values.begin() + absolutePos, val); else - ids.insert(ids.begin() + absolutePos, None); - assert(ids.size() == getNumIds()); + values.insert(values.begin() + absolutePos, None); + assert(values.size() == getNumIds()); return absolutePos; } -bool FlatAffineValueConstraints::hasIdValues() const { - return llvm::find_if(ids, [](Optional id) { return id.hasValue(); }) != - ids.end(); +bool FlatAffineValueConstraints::hasValues() const { + return llvm::find_if(values, [](Optional id) { + return id.hasValue(); + }) != values.end(); } /// Checks if two constraint systems are in the same space, i.e., if they are @@ -334,7 +335,8 @@ static bool areIdsAligned(const FlatAffineValueConstraints &a, const FlatAffineValueConstraints &b) { return a.getNumDimIds() == b.getNumDimIds() && a.getNumSymbolIds() == b.getNumSymbolIds() && - a.getNumIds() == b.getNumIds() && a.getIds().equals(b.getIds()); + a.getNumIds() == b.getNumIds() && + a.getMaybeValues().equals(b.getMaybeValues()); } /// Calls areIdsAligned to check if two constraint systems have the same set @@ -344,12 +346,12 @@ bool FlatAffineValueConstraints::areIdsAlignedWithOther( return areIdsAligned(*this, other); } -/// Checks if the SSA values associated with `cst''s identifiers are unique. +/// Checks if the SSA values associated with `cst`'s identifiers are unique. 
static bool LLVM_ATTRIBUTE_UNUSED areIdsUnique(const FlatAffineValueConstraints &cst) { SmallPtrSet uniqueIds; - for (auto id : cst.getIds()) { - if (id.hasValue() && !uniqueIds.insert(id.getValue()).second) + for (auto val : cst.getMaybeValues()) { + if (val.hasValue() && !uniqueIds.insert(val.getValue()).second) return false; } return true; @@ -368,15 +370,15 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, FlatAffineValueConstraints *b) { assert(offset <= a->getNumDimIds() && offset <= b->getNumDimIds()); // A merge/align isn't meaningful if a cst's ids aren't distinct. - assert(areIdsUnique(*a) && "A's id values aren't unique"); - assert(areIdsUnique(*b) && "B's id values aren't unique"); + assert(areIdsUnique(*a) && "A's values aren't unique"); + assert(areIdsUnique(*b) && "B's values aren't unique"); - assert(std::all_of(a->getIds().begin() + offset, - a->getIds().begin() + a->getNumDimAndSymbolIds(), + assert(std::all_of(a->getMaybeValues().begin() + offset, + a->getMaybeValues().begin() + a->getNumDimAndSymbolIds(), [](Optional id) { return id.hasValue(); })); - assert(std::all_of(b->getIds().begin() + offset, - b->getIds().begin() + b->getNumDimAndSymbolIds(), + assert(std::all_of(b->getMaybeValues().begin() + offset, + b->getMaybeValues().begin() + b->getNumDimAndSymbolIds(), [](Optional id) { return id.hasValue(); })); // Place local id's of A after local id's of B. @@ -389,8 +391,8 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, } SmallVector aDimValues, aSymValues; - a->getIdValues(offset, a->getNumDimIds(), &aDimValues); - a->getIdValues(a->getNumDimIds(), a->getNumDimAndSymbolIds(), &aSymValues); + a->getValues(offset, a->getNumDimIds(), &aDimValues); + a->getValues(a->getNumDimIds(), a->getNumDimAndSymbolIds(), &aSymValues); { // Merge dims from A into B. 
unsigned d = offset; @@ -403,7 +405,7 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, b->swapId(d, loc); } else { b->addDimId(d); - b->setIdValue(d, aDimValue); + b->setValue(d, aDimValue); } d++; } @@ -411,7 +413,7 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, // Dimensions that are in B, but not in A, are added at the end. for (unsigned t = a->getNumDimIds(), e = b->getNumDimIds(); t < e; t++) { a->addDimId(a->getNumDimIds()); - a->setIdValue(a->getNumDimIds() - 1, b->getIdValue(t)); + a->setValue(a->getNumDimIds() - 1, b->getValue(t)); } } { @@ -425,7 +427,7 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, b->swapId(s, loc); } else { b->addSymbolId(s - b->getNumDimIds()); - b->setIdValue(s, aSymValue); + b->setValue(s, aSymValue); } s++; } @@ -434,7 +436,7 @@ static void mergeAndAlignIds(unsigned offset, FlatAffineValueConstraints *a, e = b->getNumDimAndSymbolIds(); t < e; t++) { a->addSymbolId(a->getNumSymbolIds()); - a->setIdValue(a->getNumDimAndSymbolIds() - 1, b->getIdValue(t)); + a->setValue(a->getNumDimAndSymbolIds() - 1, b->getValue(t)); } } assert(areIdsAligned(*a, *b) && "IDs expected to be aligned"); @@ -520,8 +522,8 @@ void FlatAffineValueConstraints::convertLoopIVSymbolsToDims() { // Gather all symbols which are loop IVs. SmallVector loopIVs; for (unsigned i = getNumDimIds(), e = getNumDimAndSymbolIds(); i < e; i++) { - if (ids[i].hasValue() && getForInductionVarOwner(ids[i].getValue())) - loopIVs.push_back(ids[i].getValue()); + if (hasValue(i) && getForInductionVarOwner(getValue(i))) + loopIVs.push_back(getValue(i)); } // Turn each symbol in 'loopIVs' into a dim identifier. 
for (auto iv : loopIVs) { @@ -529,26 +531,26 @@ void FlatAffineValueConstraints::convertLoopIVSymbolsToDims() { } } -void FlatAffineValueConstraints::addInductionVarOrTerminalSymbol(Value id) { - if (containsId(id)) +void FlatAffineValueConstraints::addInductionVarOrTerminalSymbol(Value val) { + if (containsId(val)) return; // Caller is expected to fully compose map/operands if necessary. - assert((isTopLevelValue(id) || isForInductionVar(id)) && + assert((isTopLevelValue(val) || isForInductionVar(val)) && "non-terminal symbol / loop IV expected"); // Outer loop IVs could be used in forOp's bounds. - if (auto loop = getForInductionVarOwner(id)) { - addDimId(getNumDimIds(), id); + if (auto loop = getForInductionVarOwner(val)) { + addDimId(getNumDimIds(), val); if (failed(this->addAffineForOpDomain(loop))) LLVM_DEBUG( loop.emitWarning("failed to add domain info to constraint system")); return; } // Add top level symbol. - addSymbolId(getNumSymbolIds(), id); + addSymbolId(getNumSymbolIds(), val); // Check if the symbol is a constant. - if (auto constOp = id.getDefiningOp()) - setIdToConstant(id, constOp.getValue()); + if (auto constOp = val.getDefiningOp()) + setIdToConstant(val, constOp.getValue()); } LogicalResult @@ -605,17 +607,6 @@ FlatAffineValueConstraints::addAffineForOpDomain(AffineForOp forOp) { /*eq=*/false, /*lower=*/false); } -/// Adds constraints (lower and upper bounds) for each loop in the loop nest -/// described by the bound maps 'lbMaps' and 'ubMaps' of a computation slice. -/// Every pair ('lbMaps[i]', 'ubMaps[i]') describes the bounds of a loop in -/// the nest, sorted outer-to-inner. 'operands' contains the bound operands -/// for a single bound map. All the bound maps will use the same bound -/// operands. Note that some loops described by a computation slice might not -/// exist yet in the IR so the Value attached to those dimension identifiers -/// might be empty. 
For that reason, this method doesn't perform Value -/// look-ups to retrieve the dimension identifier positions. Instead, it -/// assumes the position of the dim identifiers in the constraint system is -/// the same as the position of the loop in the loop nest. LogicalResult FlatAffineValueConstraints::addDomainFromSliceMaps(ArrayRef lbMaps, ArrayRef ubMaps, @@ -675,7 +666,7 @@ void FlatAffineValueConstraints::addAffineIfOpDomain(AffineIfOp ifOp) { // Bind ids in the constraints to ifOp operands. SmallVector operands = ifOp.getOperands(); - cst.setIdValues(0, cst.getNumDimAndSymbolIds(), operands); + cst.setValues(0, cst.getNumDimAndSymbolIds(), operands); // Merge the constraints from ifOp to the current domain. We need first merge // and align the IDs from both constraints, and then append the constraints @@ -684,10 +675,9 @@ void FlatAffineValueConstraints::addAffineIfOpDomain(AffineIfOp ifOp) { append(cst); } -// Searches for a constraint with a non-zero coefficient at 'colIdx' in +// Searches for a constraint with a non-zero coefficient at `colIdx` in // equality (isEq=true) or inequality (isEq=false) constraints. -// Returns true and sets row found in search in 'rowIdx'. -// Returns false otherwise. +// Returns true and sets row found in search in `rowIdx`, false otherwise. static bool findConstraintWithNonZeroAt(const FlatAffineConstraints &cst, unsigned colIdx, bool isEq, unsigned *rowIdx) { @@ -704,8 +694,8 @@ static bool findConstraintWithNonZeroAt(const FlatAffineConstraints &cst, return false; } -// Normalizes the coefficient values across all columns in 'rowIDx' by their -// GCD in equality or inequality constraints as specified by 'isEq'. +// Normalizes the coefficient values across all columns in `rowIdx` by their +// GCD in equality or inequality constraints as specified by `isEq`. 
template static void normalizeConstraintByGCD(FlatAffineConstraints *constraints, unsigned rowIdx) { @@ -750,13 +740,9 @@ bool FlatAffineConstraints::hasConsistentState() const { bool FlatAffineValueConstraints::hasConsistentState() const { return FlatAffineConstraints::hasConsistentState() && - ids.size() == getNumIds(); + values.size() == getNumIds(); } -/// Checks all rows of equality/inequality constraints for trivial -/// contradictions (for example: 1 == 0, 0 >= 1), which may have surfaced -/// after elimination. Returns 'true' if an invalid constraint is found; -/// 'false' otherwise. bool FlatAffineConstraints::hasInvalidConstraint() const { assert(hasConsistentState()); auto check = [&](bool isEq) -> bool { @@ -787,9 +773,9 @@ bool FlatAffineConstraints::hasInvalidConstraint() const { return check(/*isEq=*/false); } -// Eliminate identifier from constraint at 'rowIdx' based on coefficient at -// pivotRow, pivotCol. Columns in range [elimColStart, pivotCol) will not be -// updated as they have already been eliminated. +/// Eliminate identifier from constraint at `rowIdx` based on coefficient at +/// pivotRow, pivotCol. Columns in range [elimColStart, pivotCol) will not be +/// updated as they have already been eliminated. static void eliminateFromConstraint(FlatAffineConstraints *constraints, unsigned rowIdx, unsigned pivotRow, unsigned pivotCol, unsigned elimColStart, @@ -822,8 +808,6 @@ static void eliminateFromConstraint(FlatAffineConstraints *constraints, } } -// Removes identifiers in column range [idStart, idLimit), and copies any -// remaining valid data into place, and updates member variables. 
void FlatAffineConstraints::removeIdRange(unsigned idStart, unsigned idLimit) { assert(idLimit < getNumCols() && "invalid id limit"); @@ -862,7 +846,7 @@ void FlatAffineConstraints::removeIdRange(unsigned idStart, unsigned idLimit) { void FlatAffineValueConstraints::removeIdRange(unsigned idStart, unsigned idLimit) { FlatAffineConstraints::removeIdRange(idStart, idLimit); - ids.erase(ids.begin() + idStart, ids.begin() + idLimit); + values.erase(values.begin() + idStart, values.begin() + idLimit); } /// Returns the position of the identifier that has the minimum dividend, addInequality(bound); } -bool FlatAffineValueConstraints::findId(Value id, unsigned *pos) const { +bool FlatAffineValueConstraints::findId(Value val, unsigned *pos) const { unsigned i = 0; - for (const auto &mayBeId : ids) { - if (mayBeId.hasValue() && mayBeId.getValue() == id) { + for (const auto &mayBeId : values) { + if (mayBeId.hasValue() && mayBeId.getValue() == val) { *pos = i; return true; } @@ -2178,9 +2162,9 @@ bool FlatAffineValueConstraints::findId(Value id, unsigned *pos) const { return false; } -bool FlatAffineValueConstraints::containsId(Value id) const { - return llvm::any_of(ids, [&](const Optional &mayBeId) { - return mayBeId.hasValue() && mayBeId.getValue() == id; +bool FlatAffineValueConstraints::containsId(Value val) const { + return llvm::any_of(values, [&](const Optional &mayBeId) { + return mayBeId.hasValue() && mayBeId.getValue() == val; }); } @@ -2199,7 +2183,7 @@ void FlatAffineConstraints::swapId(unsigned posA, unsigned posB) { void FlatAffineValueConstraints::swapId(unsigned posA, unsigned posB) { FlatAffineConstraints::swapId(posA, posB); - std::swap(getId(posA), getId(posB)); + std::swap(values[posA], values[posB]); } void FlatAffineConstraints::setDimSymbolSeparation(unsigned newSymbolCount) { @@ -2219,9 +2203,9 @@ void FlatAffineConstraints::setIdToConstant(unsigned pos, int64_t val) { /// Sets the specified identifier to a constant value; asserts if the id is not 
/// found. -void FlatAffineValueConstraints::setIdToConstant(Value id, int64_t val) { +void FlatAffineValueConstraints::setIdToConstant(Value value, int64_t val) { unsigned pos; - if (!findId(id, &pos)) + if (!findId(value, &pos)) // This is a pre-condition for this method. assert(0 && "id not found"); setIdToConstant(pos, val); @@ -2547,7 +2531,7 @@ void FlatAffineConstraints::print(raw_ostream &os) const { os << "("; for (unsigned i = 0, e = getNumIds(); i < e; i++) { if (auto *valueCstr = dyn_cast(this)) { - if (valueCstr->hasIdValue(i)) + if (valueCstr->hasValue(i)) os << "Value "; else os << "None "; @@ -2647,7 +2631,7 @@ void FlatAffineConstraints::removeTrivialRedundancy() { void FlatAffineConstraints::clearAndCopyFrom( const FlatAffineConstraints &other) { if (auto *otherValueSet = dyn_cast(&other)) - assert(!otherValueSet->hasIdValues() && + assert(!otherValueSet->hasValues() && "cannot copy associated Values into FlatAffineConstraints"); // Note: Assigment operator does not vtable pointer, so kind does not change. *this = other; @@ -2660,8 +2644,8 @@ void FlatAffineValueConstraints::clearAndCopyFrom( *this = *otherValueSet; } else { *static_cast(this) = other; - ids.clear(); - ids.resize(numIds, None); + values.clear(); + values.resize(numIds, None); } } @@ -2897,15 +2881,15 @@ void FlatAffineConstraints::fourierMotzkinEliminate( void FlatAffineValueConstraints::fourierMotzkinEliminate( unsigned pos, bool darkShadow, bool *isResultIntegerExact) { - SmallVector, 8> newIds; - newIds.reserve(numIds - 1); - newIds.append(ids.begin(), ids.begin() + pos); - newIds.append(ids.begin() + pos + 1, ids.end()); + SmallVector, 8> newVals; + newVals.reserve(numIds - 1); + newVals.append(values.begin(), values.begin() + pos); + newVals.append(values.begin() + pos + 1, values.end()); // Note: Base implementation discards all associated Values. 
FlatAffineConstraints::fourierMotzkinEliminate(pos, darkShadow, isResultIntegerExact); - ids = newIds; - assert(getIds().size() == getNumIds()); + values = newVals; + assert(values.size() == getNumIds()); } void FlatAffineConstraints::projectOut(unsigned pos, unsigned num) { @@ -2943,9 +2927,9 @@ void FlatAffineConstraints::projectOut(unsigned pos, unsigned num) { normalizeConstraintsByGCD(); } -void FlatAffineValueConstraints::projectOut(Value id) { +void FlatAffineValueConstraints::projectOut(Value val) { unsigned pos; - bool ret = findId(id, &pos); + bool ret = findId(val, &pos); assert(ret); (void)ret; fourierMotzkinEliminate(pos); @@ -3120,9 +3104,9 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { LogicalResult FlatAffineValueConstraints::unionBoundingBox( const FlatAffineValueConstraints &otherCst) { assert(otherCst.getNumDimIds() == numDims && "dims mismatch"); - assert(otherCst.getIds() + assert(otherCst.getMaybeValues() .slice(0, getNumDimIds()) - .equals(getIds().slice(0, getNumDimIds())) && + .equals(getMaybeValues().slice(0, getNumDimIds())) && "dim values mismatch"); assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here"); assert(getNumLocalIds() == 0 && "local ids not supported yet here"); @@ -3208,9 +3192,9 @@ void FlatAffineValueConstraints::getIneqAsAffineValueMap( // Get the values to bind to this affine expr (all dims and symbols). 
SmallVector operands; - getIdValues(0, pos, &operands); + getValues(0, pos, &operands); SmallVector trailingOperands; - getIdValues(pos + 1, getNumDimAndSymbolIds(), &trailingOperands); + getValues(pos + 1, getNumDimAndSymbolIds(), &trailingOperands); operands.append(trailingOperands.begin(), trailingOperands.end()); vmap.reset(AffineMap::get(numDims - 1, numSyms, boundExpr), operands); } diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index 328a5f4967664..93b36929999d9 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -558,7 +558,7 @@ LogicalResult MemRefRegion::compute(Operation *op, unsigned loopDepth, assert(loopDepth <= enclosingIVs.size() && "invalid loop depth"); enclosingIVs.resize(loopDepth); SmallVector ids; - cst.getIdValues(cst.getNumDimIds(), cst.getNumDimAndSymbolIds(), &ids); + cst.getValues(cst.getNumDimIds(), cst.getNumDimAndSymbolIds(), &ids); for (auto id : ids) { AffineForOp iv; if ((iv = getForInductionVarOwner(id)) && @@ -762,7 +762,7 @@ static Operation *getInstAtPosition(ArrayRef positions, static LogicalResult addMissingLoopIVBounds(SmallPtrSet &ivs, FlatAffineValueConstraints *cst) { for (unsigned i = 0, e = cst->getNumDimIds(); i < e; ++i) { - auto value = cst->getIdValue(i); + auto value = cst->getValue(i); if (ivs.count(value) == 0) { assert(isForInductionVar(value)); auto loop = getForInductionVarOwner(value); @@ -877,10 +877,10 @@ mlir::computeSliceUnion(ArrayRef opsA, ArrayRef opsB, // system. 
SmallPtrSet sliceUnionIVs; for (unsigned k = 0, l = sliceUnionCst.getNumDimIds(); k < l; ++k) - sliceUnionIVs.insert(sliceUnionCst.getIdValue(k)); + sliceUnionIVs.insert(sliceUnionCst.getValue(k)); SmallPtrSet tmpSliceIVs; for (unsigned k = 0, l = tmpSliceCst.getNumDimIds(); k < l; ++k) - tmpSliceIVs.insert(tmpSliceCst.getIdValue(k)); + tmpSliceIVs.insert(tmpSliceCst.getValue(k)); sliceUnionCst.mergeAndAlignIdsWithOther(/*offset=*/0, &tmpSliceCst); @@ -938,13 +938,13 @@ mlir::computeSliceUnion(ArrayRef opsA, ArrayRef opsB, // Add slice bound operands of union. SmallVector sliceBoundOperands; - sliceUnionCst.getIdValues(numSliceLoopIVs, - sliceUnionCst.getNumDimAndSymbolIds(), - &sliceBoundOperands); + sliceUnionCst.getValues(numSliceLoopIVs, + sliceUnionCst.getNumDimAndSymbolIds(), + &sliceBoundOperands); // Copy src loop IVs from 'sliceUnionCst' to 'sliceUnion'. sliceUnion->ivs.clear(); - sliceUnionCst.getIdValues(0, numSliceLoopIVs, &sliceUnion->ivs); + sliceUnionCst.getValues(0, numSliceLoopIVs, &sliceUnion->ivs); // Set loop nest insertion point to block start at 'loopDepth'. sliceUnion->insertPoint = @@ -1068,8 +1068,8 @@ void mlir::getComputationSliceState( // Add slice loop IV values to 'sliceState'. unsigned offset = isBackwardSlice ? 0 : loopDepth; unsigned numSliceLoopIVs = isBackwardSlice ? numSrcLoopIVs : numDstLoopIVs; - dependenceConstraints->getIdValues(offset, offset + numSliceLoopIVs, - &sliceState->ivs); + dependenceConstraints->getValues(offset, offset + numSliceLoopIVs, + &sliceState->ivs); // Set up lower/upper bound affine maps for the slice. 
sliceState->lbs.resize(numSliceLoopIVs, AffineMap()); @@ -1085,7 +1085,7 @@ void mlir::getComputationSliceState( unsigned numDimsAndSymbols = dependenceConstraints->getNumDimAndSymbolIds(); for (unsigned i = 0; i < numDimsAndSymbols; ++i) { if (i < offset || i >= offset + numSliceLoopIVs) { - sliceBoundOperands.push_back(dependenceConstraints->getIdValue(i)); + sliceBoundOperands.push_back(dependenceConstraints->getValue(i)); } } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp index b6fbdb7612599..24a202f27f491 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -769,7 +769,7 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels, return failure(); SmallVector allValues; - constraints.getAllIdValues(&allValues); + constraints.getAllValues(&allValues); SmallVector allNonLoopValues(allValues.begin() + numLoops, allValues.end()); diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 7026ad0166369..c19c887a593d6 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -921,7 +921,7 @@ static Value createPrivateMemRef(AffineForOp forOp, Operation *srcStoreOpInst, // on; this would correspond to loop IVs surrounding the level at which the // slice is being materialized. 
SmallVector outerIVs; - cst->getIdValues(rank, cst->getNumIds(), &outerIVs); + cst->getValues(rank, cst->getNumIds(), &outerIVs); // Build 'rank' AffineExprs from MemRefRegion 'lbs' SmallVector offsets; diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index fcca1ae0bd0f9..2c9cd3ce6bc44 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -2206,7 +2206,7 @@ findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, Block::iterator *copyOutPlacementStart) { const auto *cst = region.getConstraints(); SmallVector symbols; - cst->getIdValues(cst->getNumDimIds(), cst->getNumDimAndSymbolIds(), &symbols); + cst->getValues(cst->getNumDimIds(), cst->getNumDimAndSymbolIds(), &symbols); SmallVector enclosingFors; getLoopIVs(*block.begin(), &enclosingFors); @@ -2445,7 +2445,7 @@ static LogicalResult generateCopy( // on; these typically include loop IVs surrounding the level at which the // copy generation is being done or other valid symbols in MLIR. SmallVector regionSymbols; - cst->getIdValues(rank, cst->getNumIds(), ®ionSymbols); + cst->getValues(rank, cst->getNumIds(), ®ionSymbols); // Construct the index expressions for the fast memory buffer. The index // expression for a particular dimension of the fast buffer is obtained by @@ -2689,7 +2689,7 @@ static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs, SmallVector symbols; extractForInductionVars(ivs, &symbols); regionCst->reset(rank, numParamLoopIVs, 0); - regionCst->setIdValues(rank, rank + numParamLoopIVs, symbols); + regionCst->setValues(rank, rank + numParamLoopIVs, symbols); // Memref dim sizes provide the bounds. 
for (unsigned d = 0; d < rank; d++) { @@ -3068,7 +3068,7 @@ static AffineIfOp createSeparationCondition(MutableArrayRef loops, return nullptr; SmallVector setOperands; - cst.getIdValues(0, cst.getNumDimAndSymbolIds(), &setOperands); + cst.getValues(0, cst.getNumDimAndSymbolIds(), &setOperands); canonicalizeSetAndOperands(&ifCondSet, &setOperands); return b.create(loops[0].getLoc(), ifCondSet, setOperands, /*withElseRegion=*/true); From 99c790dc21b80cceae6084f6cec8c66e75c8d390 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 17 Aug 2021 10:24:49 +0900 Subject: [PATCH 174/700] [AMDGPU] Make BVH isel consistent with other MIMG opcodes Suffix opcodes with _gfx10. Remove direct references to architecture specific opcodes. Add a BVH flag and apply this to diassembly. Fix a number of disassembly errors on gfx90a target caused by previous incorrect BVH detection code. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D108117 --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 31 +++++---- .../Disassembler/AMDGPUDisassembler.cpp | 15 ++-- llvm/lib/Target/AMDGPU/MIMGInstructions.td | 36 ++++++---- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 34 ++++++---- .../Target/AMDGPU/SILoadStoreOptimizer.cpp | 12 +--- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 ++ .../Disassembler/AMDGPU/gfx90a_ldst_acc.txt | 68 +++++++++---------- .../MC/Disassembler/AMDGPU/mimg_gfx90a.txt | 6 +- 8 files changed, 112 insertions(+), 94 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5a8c14234bb77..71e120974c2e2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4825,18 +4825,25 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; - const unsigned NumVAddrs = IsA16 ? (Is64 ? 
9 : 8) : (Is64 ? 12 : 11); - const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); - const unsigned Opcodes[2][2][2] = { - {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa}, - {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}}, - {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa}, - {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}}; - const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64]; + const unsigned NumVDataDwords = 4; + const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); + const bool UseNSA = + ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); + const unsigned BaseOpcodes[2][2] = { + {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, + {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; + int Opcode; + if (UseNSA) { + Opcode = + AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, + NumVDataDwords, NumVAddrDwords); + } else { + Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + AMDGPU::MIMGEncGfx10Default, NumVDataDwords, + PowerOf2Ceil(NumVAddrDwords)); + } + assert(Opcode != -1); SmallVector Ops; if (Is64) { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fe62b8590fa0e..555f6bc5cd960 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -693,22 +693,21 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::d16); + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + assert(VDataIdx != -1); - if (DMaskIdx == -1 || TFEIdx == 
-1) {// intersect_ray + if (BaseOpcode->BVH) { + // Add A16 operand for intersect_ray instructions if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16) > -1) { - assert(MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa || - MI.getOpcode() == AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa || - MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa || - MI.getOpcode() == AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa); addOperand(MI, MCOperand::createImm(1)); } return MCDisassembler::Success; } - const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); bool IsAtomic = (VDstIdx != -1); bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; - bool IsNSA = false; unsigned AddrSize = Info->VAddrDwords; @@ -717,8 +716,6 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dim); int A16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::a16); - const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); const AMDGPU::MIMGDimInfo *Dim = AMDGPU::getMIMGDimInfoByEncoding(MI.getOperand(DimIdx).getImm()); const bool IsA16 = (A16Idx != -1 && MI.getOperand(A16Idx).getImm()); diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index bacb790aac62f..288cf6b02f9ff 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -43,6 +43,7 @@ class MIMGBaseOpcode : PredicateControl { bit HasD16 = 0; bit IsAtomicRet = 0; bit MSAA = 0; + bit BVH = 0; } def MIMGBaseOpcode : GenericEnum { @@ -54,7 +55,7 @@ def MIMGBaseOpcodesTable : GenericTable { let CppTypeName = "MIMGBaseOpcodeInfo"; let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", - "LodOrClampOrMip", "HasD16", "MSAA"]; + "LodOrClampOrMip", "HasD16", "MSAA", "BVH"]; string TypeOf_BaseOpcode 
= "MIMGBaseOpcode"; let PrimaryKey = ["BaseOpcode"]; @@ -872,6 +873,14 @@ multiclass MIMG_Gather : MIMG_Gather; +class MIMG_IntersectRay_Helper { + int num_addrs = !if(Is64, !if(A16, 9, 12), !if(A16, 8, 11)); + // TODO: MIMGAddrSize will choose VReg_512 which is a 16 register tuple, + // when we only need 9, 11 or 12 depending on A16 field and ptr size. + RegisterClass RegClass = MIMGAddrSize.RegClass; + int VAddrDwords = !srl(RegClass.Size, 5); +} + class MIMG_IntersectRay_gfx10 : MIMG_gfx10 { @@ -890,8 +899,11 @@ class MIMG_IntersectRay_nsa_gfx10 { - def "" : MIMGBaseOpcode; +multiclass MIMG_IntersectRay { + defvar info = MIMG_IntersectRay_Helper; + def "" : MIMGBaseOpcode { + let BVH = 1; + } let SubtargetPredicate = HasGFX10_AEncoding, AssemblerPredicate = HasGFX10_AEncoding, AsmMatchConverter = !if(A16, "cvtIntersectRay", ""), @@ -908,13 +920,11 @@ multiclass MIMG_IntersectRay d16 = 0, BaseOpcode = !cast(NAME), VDataDwords = 4 in { - // TODO: MIMGAddrSize will choose VReg_512 which is a 16 register tuple, - // when we only need 9, 11 or 12 depending on A16 field and ptr size. 
- def "_sa" : MIMG_IntersectRay_gfx10.RegClass, A16> { - let VAddrDwords = !srl(MIMGAddrSize.RegClass.Size, 5); + def _sa_gfx10 : MIMG_IntersectRay_gfx10 { + let VAddrDwords = info.VAddrDwords; } - def _nsa : MIMG_IntersectRay_nsa_gfx10 { - let VAddrDwords = num_addrs; + def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10 { + let VAddrDwords = info.num_addrs; } } } @@ -1045,10 +1055,10 @@ defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd let SubtargetPredicate = HasGFX10_AEncoding in defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler , "image_msaa_load", 1, 0, 0, 1>; -defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 11, 0>; -defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 8, 1>; -defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 12, 0>; -defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 9, 1>; +defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 0>; +defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 0, 1>; +defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 0>; +defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh64_intersect_ray", 1, 1>; /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0d21d77d60470..b16cf1f3bed3e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7373,19 +7373,25 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; const bool Is64 = NodePtr.getValueType() == MVT::i64; - const unsigned NumVAddrs = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 
12 : 11); - const bool UseNSA = - Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize(); - const unsigned Opcodes[2][2][2] = { - {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa}, - {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa}}, - {{AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa}, - {AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa, - AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa}}}; - const unsigned Opcode = Opcodes[UseNSA][IsA16][Is64]; + const unsigned NumVDataDwords = 4; + const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); + const bool UseNSA = Subtarget->hasNSAEncoding() && + NumVAddrDwords <= Subtarget->getNSAMaxSize(); + const unsigned BaseOpcodes[2][2] = { + {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, + {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, + AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; + int Opcode; + if (UseNSA) { + Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], + AMDGPU::MIMGEncGfx10NSA, NumVDataDwords, + NumVAddrDwords); + } else { + Opcode = AMDGPU::getMIMGOpcode( + BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords, + PowerOf2Ceil(NumVAddrDwords)); + } + assert(Opcode != -1); SmallVector Ops; @@ -7428,7 +7434,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (!UseNSA) { // Build a single vector containing all the operands so far prepared. 
- if (NumVAddrs > 8) { + if (NumVAddrDwords > 8) { SDValue Undef = DAG.getUNDEF(MVT::i32); Ops.append(16 - Ops.size(), Undef); } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 493c1ad87f93d..5dd621856a721 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -343,6 +343,9 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) return UNKNOWN; + // Ignore BVH instructions + if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) + return UNKNOWN; // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc)) @@ -380,15 +383,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::DS_WRITE_B64: case AMDGPU::DS_WRITE_B64_gfx9: return DS_WRITE; - case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa: - case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa: - case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa: - case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa: - case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa: - case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa: - case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa: - case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa: - return UNKNOWN; } } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 72c872dec5ba7..5bd9f85fab993 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -292,8 +292,12 @@ struct MIMGBaseOpcodeInfo { bool LodOrClampOrMip; bool HasD16; bool MSAA; + bool BVH; }; +LLVM_READONLY +const MIMGBaseOpcodeInfo *getMIMGBaseOpcode(unsigned Opc); + LLVM_READONLY const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode); diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt index 0b65c52d66d8d..0688cd71537de 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt @@ -7869,40 +7869,40 @@ # GFX90A: image_load a5, v2, s[8:15] dmask:0x2 ; encoding: [0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00] 0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:7], v2, s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_load a5, v2, s[8:15] dmask:0x4 ; encoding: [0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00] 0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:7], v2, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0x6 ; encoding: [0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:7], v2, s[8:15] dmask:0x6 ; encoding: [0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0x7 ; encoding: [0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:8], v2, s[8:15] dmask:0x7 ; encoding: [0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_load a5, v2, s[8:15] dmask:0x8 ; encoding: [0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00] 0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0x9 ; encoding: [0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:7], v2, s[8:15] dmask:0x9 ; encoding: [0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] 
dmask:0xa ; encoding: [0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:7], v2, s[8:15] dmask:0xa ; encoding: [0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0xb ; encoding: [0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:8], v2, s[8:15] dmask:0xb ; encoding: [0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0xc ; encoding: [0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:7], v2, s[8:15] dmask:0xc ; encoding: [0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0xd ; encoding: [0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:8], v2, s[8:15] dmask:0xd ; encoding: [0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00 -# GFX90A: image_load a6, v2, s[8:15] dmask:0xe ; encoding: [0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_load a[6:8], v2, s[8:15] dmask:0xe ; encoding: [0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00] 0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_load a5, v2, s[8:15] ; encoding: [0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00] @@ -7944,43 +7944,43 @@ # GFX90A: image_store a1, v2, s[12:19] dmask:0x2 unorm ; encoding: [0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00] 0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:3], v2, s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00 # GFX90A: image_store a1, v2, s[12:19] dmask:0x4 unorm ; encoding: [0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00] 0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0x5 unorm ; encoding: [0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: 
image_store a[2:3], v2, s[12:19] dmask:0x5 unorm ; encoding: [0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0x6 unorm ; encoding: [0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:3], v2, s[12:19] dmask:0x6 unorm ; encoding: [0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0x7 unorm ; encoding: [0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:4], v2, s[12:19] dmask:0x7 unorm ; encoding: [0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00 # GFX90A: image_store a1, v2, s[12:19] dmask:0x8 unorm ; encoding: [0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00] 0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0x9 unorm ; encoding: [0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:3], v2, s[12:19] dmask:0x9 unorm ; encoding: [0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0xa unorm ; encoding: [0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:3], v2, s[12:19] dmask:0xa unorm ; encoding: [0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0xb unorm ; encoding: [0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:4], v2, s[12:19] dmask:0xb unorm ; encoding: [0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0xc unorm ; encoding: [0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:3], v2, s[12:19] dmask:0xc unorm ; encoding: [0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0xd unorm ; encoding: [0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: 
image_store a[2:4], v2, s[12:19] dmask:0xd unorm ; encoding: [0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0xe unorm ; encoding: [0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:4], v2, s[12:19] dmask:0xe unorm ; encoding: [0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00 -# GFX90A: image_store a2, v2, s[12:19] dmask:0xf unorm ; encoding: [0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00] +# GFX90A: image_store a[2:5], v2, s[12:19] dmask:0xf unorm ; encoding: [0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00] 0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00 # GFX90A: image_store a1, v2, s[12:19] unorm ; encoding: [0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00] @@ -8016,7 +8016,7 @@ # GFX90A: image_atomic_swap a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_swap a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_swap a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00] @@ -8046,7 +8046,7 @@ # GFX90A: image_atomic_cmpswap a[6:7], v2, s[92:99] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00] 0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00 -# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_cmpswap a[6:9], v2, s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00] 0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00] @@ -8076,7 +8076,7 @@ # GFX90A: image_atomic_add a5, v2, s[92:99] 
dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_add a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_add a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00] @@ -8106,7 +8106,7 @@ # GFX90A: image_atomic_sub a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_sub a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_sub a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00] @@ -8136,7 +8136,7 @@ # GFX90A: image_atomic_smin a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_smin a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_smin a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00] @@ -8166,7 +8166,7 @@ # GFX90A: image_atomic_umin a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_umin a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_umin a[6:7], v2, s[8:15] dmask:0x3 unorm ; 
encoding: [0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00] @@ -8196,7 +8196,7 @@ # GFX90A: image_atomic_smax a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_smax a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_smax a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00] @@ -8226,7 +8226,7 @@ # GFX90A: image_atomic_umax a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_umax a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_umax a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00] @@ -8256,7 +8256,7 @@ # GFX90A: image_atomic_and a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_and a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_and a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00] @@ -8286,7 +8286,7 @@ # GFX90A: image_atomic_or a5, v2, s[92:99] dmask:0x1 unorm ; 
encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_or a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_or a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00] @@ -8316,7 +8316,7 @@ # GFX90A: image_atomic_xor a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_xor a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_xor a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00] @@ -8346,7 +8346,7 @@ # GFX90A: image_atomic_inc a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_inc a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_inc a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00] @@ -8376,7 +8376,7 @@ # GFX90A: image_atomic_dec a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00] 0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00 -# GFX90A: image_atomic_dec a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00] +# GFX90A: image_atomic_dec a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: 
[0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00] 0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00 # GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt b/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt index ced068d669caf..b902fca9288ce 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt @@ -1,6 +1,6 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX90A -# GFX90A: image_load v4, v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00] +# GFX90A: image_load v[4:6], v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00] 0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00 # GFX90A: image_load_pck v5, v0, s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00] @@ -15,10 +15,10 @@ # GFX90A: image_load_mip_pck v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00] 0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00 -# GFX90A: image_load_mip_pck_sgn v4, v0, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00] +# GFX90A: image_load_mip_pck_sgn v[4:5], v0, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00] 0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00 -# GFX90A: image_store v192, v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00] +# GFX90A: image_store v[192:194], v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00] 0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00 # GFX90A: image_store_pck v1, v2, s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00] From 08d55c5c0156a3759d375711771fccfa3e9a79a5 Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Mon, 16 Aug 2021 15:19:43 -0700 Subject: [PATCH 175/700] [lld-macho] Refactor parseSections to avoid 
creating isec on LLVM segments Address post follow up comment in D108016. Avoid creating isec for LLVM segments since we are skipping over it. Reviewed By: #lld-macho, int3 Differential Revision: https://reviews.llvm.org/D108167 --- lld/MachO/InputFiles.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index 32279da1cf627..8d64f2731d8d8 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -279,6 +279,14 @@ void ObjFile::parseSections(ArrayRef
sections) { {off, make(segname, name, this, data.slice(off, literalSize), align, flags)}); + } else if (segname == segment_names::llvm) { + // ld64 does not appear to emit contents from sections within the __LLVM + // segment. Symbols within those sections point to bitcode metadata + // instead of actual symbols. Global symbols within those sections could + // have the same name without causing duplicate symbol errors. Push an + // empty map to ensure indices line up for the remaining sections. + // TODO: Evaluate whether the bitcode metadata is needed. + subsections.push_back({}); } else { auto *isec = make(segname, name, this, data, align, flags); @@ -290,14 +298,6 @@ void ObjFile::parseSections(ArrayRef
sections) { // empty map to ensure the indices line up for the remaining sections. subsections.push_back({}); debugSections.push_back(isec); - } else if (isec->getSegName() == segment_names::llvm) { - // ld64 does not appear to emit contents from sections within the __LLVM - // segment. Symbols within those sections point to bitcode metadata - // instead of actual symbols. Global symbols within those sections could - // have the same name without causing duplicate symbol errors. Push an - // empty map to ensure indices line up for the remaining sections. - // TODO: Evaluate whether the bitcode metadata is needed. - subsections.push_back({}); } else { subsections.push_back({{0, isec}}); } From 5821047aaca23184ee30fb212bfbbfa25967448b Mon Sep 17 00:00:00 2001 From: John Demme Date: Mon, 16 Aug 2021 19:18:23 -0700 Subject: [PATCH 176/700] [MLIR] [Python] Fix out-of-tree Windows python bindings MSVC needs to know where to put the archive (.lib) as well as the runtime (.dll). If left to the default location, multiple rules to generate the same file will be produced, creating a Ninja error. Differential Revision: https://reviews.llvm.org/D108181 --- mlir/cmake/modules/AddMLIRPython.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 48390f45ff90c..b1e2f0b3f5559 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -494,6 +494,7 @@ function(add_mlir_python_extension libname extname) set_target_properties( ${libname} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ARG_OUTPUT_DIRECTORY} + ARCHIVE_OUTPUT_DIRECTORY ${ARG_OUTPUT_DIRECTORY} ) endif() From 54e76cb17abdfef1b3549fff00f51e72ae966b4c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 16 Aug 2021 19:23:11 -0700 Subject: [PATCH 177/700] [split-file] Default to --no-leading-lines It turns out that the --leading-lines may be a bad default. [[#@LINE+-num]] is rarely used. 
--- lld/test/ELF/linkerscript/overwrite-sections.test | 2 +- llvm/test/tools/split-file/basic.test | 6 +++--- llvm/tools/split-file/split-file.cpp | 12 ++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lld/test/ELF/linkerscript/overwrite-sections.test b/lld/test/ELF/linkerscript/overwrite-sections.test index 06918728826db..0a14542f7d10d 100644 --- a/lld/test/ELF/linkerscript/overwrite-sections.test +++ b/lld/test/ELF/linkerscript/overwrite-sections.test @@ -1,5 +1,5 @@ # REQUIRES: x86 -# RUN: rm -rf %t && split-file %s %t +# RUN: rm -rf %t && split-file --leading-lines %s %t # RUN: llvm-mc -filetype=obj -triple=x86_64 %t/a.s -o %t/a.o ## There is no main linker script. OVERWRITE_SECTIONS defines output section diff --git a/llvm/test/tools/split-file/basic.test b/llvm/test/tools/split-file/basic.test index 5d32c3429ed37..b47aeb1e11801 100644 --- a/llvm/test/tools/split-file/basic.test +++ b/llvm/test/tools/split-file/basic.test @@ -9,20 +9,20 @@ cc //--- end # RUN: rm -rf %t -# RUN: split-file %s %t +# RUN: split-file --leading-lines %s %t # RUN: diff %S/Inputs/basic-aa.txt %t/aa # RUN: diff %S/Inputs/basic-bb.txt %t/bb # RUN: diff %S/Inputs/basic-cc.txt %t/subdir/cc # RUN: FileCheck %s --check-prefix=END < %t/end ## Can be called on a non-empty directory. -# RUN: split-file %s %t +# RUN: split-file --leading-lines %s %t # RUN: diff %S/Inputs/basic-aa.txt %t/aa ## Test that we will delete the output if it is a file, so that we can create ## a directory. 
# RUN: rm -rf %t && touch %t -# RUN: split-file %s %t +# RUN: split-file --leading-lines %s %t # RUN: diff %S/Inputs/basic-aa.txt %t/aa # END: RUN: split-file %s %t diff --git a/llvm/tools/split-file/split-file.cpp b/llvm/tools/split-file/split-file.cpp index 355678433ac16..bde7d21a51e9a 100644 --- a/llvm/tools/split-file/split-file.cpp +++ b/llvm/tools/split-file/split-file.cpp @@ -35,8 +35,12 @@ static cl::opt input(cl::Positional, cl::desc("filename"), static cl::opt output(cl::Positional, cl::desc("directory"), cl::value_desc("directory"), cl::cat(cat)); +static cl::opt leadingLines("leading-lines", + cl::desc("Preserve line numbers"), + cl::cat(cat)); + static cl::opt noLeadingLines("no-leading-lines", - cl::desc("Don't preserve line numbers"), + cl::desc("Don't preserve line numbers (default)"), cl::cat(cat)); static StringRef toolName; @@ -96,9 +100,9 @@ static int handle(MemoryBuffer &inputBuf, StringRef input) { Part &cur = res.first->second; if (!i.is_at_eof()) cur.begin = i->data(); - // If --no-leading-lines is not specified, numEmptyLines is 0. Append - // newlines so that the extracted part preserves line numbers. - cur.leadingLines = noLeadingLines ? 0 : i.line_number() - 1; + // If --leading-lines is specified, numEmptyLines is 0. Append newlines so + // that the extracted part preserves line numbers. + cur.leadingLines = leadingLines ? i.line_number() - 1 : 0; lastPart = partName; } From f74b70ef57fd038e9e6a781ef5cc72bb9734abe4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 16 Aug 2021 19:41:11 -0700 Subject: [PATCH 178/700] [lld-macho][test] Remove ld64.lld: prefix in a diagnostic The convention is not to check the prefix before `error: `. This gives flexibility if we need to rename ld64.lld to something else, (e.g. a while ago we used ld64.lld.darwinnew). 
--- lld/test/MachO/discard-llvm-sections.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/MachO/discard-llvm-sections.s b/lld/test/MachO/discard-llvm-sections.s index 571fdbb0b2da0..a05456dc96672 100644 --- a/lld/test/MachO/discard-llvm-sections.s +++ b/lld/test/MachO/discard-llvm-sections.s @@ -23,7 +23,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin --defsym TEXT=0 %t/bar.s -o %t/bar.o # RUN: not %lld -dylib %t/foo.o %t/bar.o -o %t/libDuplicate.dylib 2>&1 | FileCheck %s --check-prefix=DUP -# DUP: ld64.lld: error: duplicate symbol: _llvm.foo +# DUP: error: duplicate symbol: _llvm.foo #--- foo.s .globl _llvm.foo From 686607676f720a27b5946d3cb7800e18181a312f Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 12 Aug 2021 22:54:42 -0400 Subject: [PATCH 179/700] [AMDGPU] Skip pseudo MIs in hazard recognizer Instructions like WAVE_BARRIER and SI_MASKED_UNREACHABLE are only placeholders to prevent certain unwanted transformations and will get discarded during assembly emission. They should not be counted during nop insertion. 
Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D108022 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 +++ .../AMDGPU/hazard-pseudo-machineinstrs.mir | 45 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/hazard.mir | 46 +++++++++++++++++++ 4 files changed, 102 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index bc2fb1e9770c8..7b5ced3ff3a55 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -363,6 +363,10 @@ void GCNHazardRecognizer::AdvanceCycle() { } unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); + if (!NumWaitStates) { + CurrCycleInstr = nullptr; + return; + } // Keep track of emitted instructions EmittedInstrs.push_front(CurrCycleInstr); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7ab0f7a100c5e..2cdd98fe00602 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1641,6 +1641,13 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { case AMDGPU::S_NOP: return MI.getOperand(0).getImm() + 1; + + // FIXME: Any other pseudo instruction? + // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The + // hazard, even if one exist, won't really be visible. Should we handle it? 
+ case AMDGPU::SI_MASKED_UNREACHABLE: + case AMDGPU::WAVE_BARRIER: + return 0; } } diff --git a/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir b/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir new file mode 100644 index 0000000000000..b477c9b5ba90b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir @@ -0,0 +1,45 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-sched %s -o - | FileCheck -check-prefix=GCN %s + +# WAVE_BARRIER and SI_MASKED_UNREACHABLE are not really instructions. +# To fix the hazard (m0 def followed by V_INTERP), the scheduler +# should move another instruction into the slot. +--- +# CHECK-LABEL: name: hazard_wave_barrier +# CHECK-LABEL: bb.0: +# GCN: $m0 = S_MOV_B32 killed renamable $sgpr0 +# GCN-NEXT: WAVE_BARRIER +# GCN-NEXT: S_MOV_B32 0 +# GCN-NEXT: V_INTERP_MOV_F32 +name: hazard_wave_barrier +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + $m0 = S_MOV_B32 killed renamable $sgpr0 + WAVE_BARRIER + renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec + renamable $sgpr1 = S_MOV_B32 0 + S_ENDPGM 0 + +... +# GCN-LABEL: name: hazard-masked-unreachable +# CHECK-LABEL: bb.0: +# GCN: $m0 = S_MOV_B32 killed renamable $sgpr0 +# GCN-NEXT: SI_MASKED_UNREACHABLE +# GCN-NEXT: S_MOV_B32 0 +# GCN-NEXT: V_INTERP_MOV_F32 +--- +name: hazard-masked-unreachable +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + + $m0 = S_MOV_B32 killed renamable $sgpr0 + SI_MASKED_UNREACHABLE + renamable $vgpr0 = V_INTERP_MOV_F32 2, 0, 0, implicit $mode, implicit $m0, implicit $exec + renamable $sgpr1 = S_MOV_B32 0 + bb.1: + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/hazard.mir b/llvm/test/CodeGen/AMDGPU/hazard.mir index 1b53aac3646be..5bc4c62569a25 100644 --- a/llvm/test/CodeGen/AMDGPU/hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard.mir @@ -125,3 +125,49 @@ body: | S_SENDMSG 3, implicit $exec, implicit $m0 S_ENDPGM 0 ... +# GCN-LABEL: name: hazard-lookahead-wave-barrier +# GCN: S_WAITCNT 0 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_ADD_F16_dpp +--- +name: hazard-lookahead-wave-barrier +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr3 + + renamable $vgpr1 = contract nofpexcept V_ADD_F16_e32 killed $vgpr1, $vgpr0, implicit $mode, implicit $exec + WAVE_BARRIER + S_WAITCNT 0 + renamable $vgpr2 = contract nofpexcept V_ADD_F16_dpp undef $vgpr2, 0, $vgpr1, 0, $vgpr3, 273, 15, 15, 1, implicit $mode, implicit $exec +... +# GCN-LABEL: name: hazard-lookahead-masked-unreachable +# GCN: SI_MASKED_UNREACHABLE +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_SENDMSG +--- +name: hazard-lookahead-masked-unreachable +body: | + bb.0: + $m0 = S_MOV_B32 -1 + SI_MASKED_UNREACHABLE + S_SENDMSG 3, implicit $exec, implicit $m0 + + bb.1: + S_ENDPGM 0 +... +# GCN-LABEL: name: fallthrough-hazard-lookahead-masked-unreachable +# GCN: SI_MASKED_UNREACHABLE +# GCN-LABEL: bb.1: +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: S_SENDMSG +--- +name: fallthrough-hazard-lookahead-masked-unreachable +body: | + bb.0: + $m0 = S_MOV_B32 -1 + SI_MASKED_UNREACHABLE + + bb.1: + S_SENDMSG 3, implicit $exec, implicit $m0 + S_ENDPGM 0 +... From b31199bab4865deef4e778d7a028c8ec64285654 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Sat, 7 Aug 2021 16:29:27 +0800 Subject: [PATCH 180/700] [AVR][clang] Improve search for avr-libc installation path Search avr-libc path according to avr-gcc installation at first, then other possible installed pathes. 
Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D107682 --- clang/lib/Driver/ToolChains/AVR.cpp | 14 ++++++++++++-- .../basic_avr_tree_2/opt/local/avr/include/.keep | 0 .../basic_avr_tree_2/opt/local/avr/lib/libavr.a | 0 .../opt/local/lib/gcc/avr/10.3.0/libgcc.a | 0 .../Inputs/basic_avr_tree_2/usr/avr/include/.keep | 0 .../Inputs/basic_avr_tree_2/usr/avr/lib/libavr.a | 0 clang/test/Driver/avr-toolchain.c | 14 +++++++++++++- 7 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/avr/include/.keep create mode 100644 clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/avr/lib/libavr.a create mode 100644 clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/lib/gcc/avr/10.3.0/libgcc.a create mode 100644 clang/test/Driver/Inputs/basic_avr_tree_2/usr/avr/include/.keep create mode 100644 clang/test/Driver/Inputs/basic_avr_tree_2/usr/avr/lib/libavr.a diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index cebf9d13a4ce0..5a12406a51cc6 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -453,11 +453,21 @@ void AVR::Linker::ConstructJob(Compilation &C, const JobAction &JA, } llvm::Optional AVRToolChain::findAVRLibcInstallation() const { + // Search avr-libc installation according to avr-gcc installation. + std::string GCCParent(GCCInstallation.getParentLibPath()); + std::string Path(GCCParent + "/avr"); + if (llvm::sys::fs::is_directory(Path)) + return Path; + Path = GCCParent + "/../avr"; + if (llvm::sys::fs::is_directory(Path)) + return Path; + + // Search avr-libc installation from possible locations, and return the first + // one that exists, if there is no avr-gcc installed. for (StringRef PossiblePath : PossibleAVRLibcLocations) { std::string Path = getDriver().SysRoot + PossiblePath.str(); - // Return the first avr-libc installation that exists. 
if (llvm::sys::fs::is_directory(Path)) - return Optional(Path); + return Path; } return llvm::None; diff --git a/clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/avr/include/.keep b/clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/avr/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/avr/lib/libavr.a b/clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/avr/lib/libavr.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/lib/gcc/avr/10.3.0/libgcc.a b/clang/test/Driver/Inputs/basic_avr_tree_2/opt/local/lib/gcc/avr/10.3.0/libgcc.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_avr_tree_2/usr/avr/include/.keep b/clang/test/Driver/Inputs/basic_avr_tree_2/usr/avr/include/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/basic_avr_tree_2/usr/avr/lib/libavr.a b/clang/test/Driver/Inputs/basic_avr_tree_2/usr/avr/lib/libavr.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/avr-toolchain.c b/clang/test/Driver/avr-toolchain.c index 74eb6ff37aed1..6bb7a00b6a3b5 100644 --- a/clang/test/Driver/avr-toolchain.c +++ b/clang/test/Driver/avr-toolchain.c @@ -6,12 +6,24 @@ // CHECK1-SAME: "-resource-dir" "[[RESOURCE:[^"]+]]" // CHECK1-SAME: "-isysroot" "[[SYSROOT:[^"]+/basic_avr_tree]]" // CHECK1-SAME: "-internal-isystem" -// CHECK1-SAME: {{^}} "[[SYSROOT]]/usr/lib/avr/include" +// CHECK1-SAME: {{^}} "[[SYSROOT]]/usr/lib/gcc/avr/5.4.0/../../../avr/include" // CHECK1-NOT: "-L // CHECK1: avr-ld" // CHECK1-SAME: "-o" "a.out" // CHECK1-SAME: {{^}} "--gc-sections" +// RUN: %clang %s -### -target avr --sysroot %S/Inputs/basic_avr_tree_2/opt/local -S 2>&1 | FileCheck --check-prefix=CHECK2 %s +// CHECK2: clang{{.*}} "-cc1" "-triple" "avr" +// CHECK2-SAME: "-isysroot" 
"[[SYSROOT:[^"]+/basic_avr_tree_2/opt/local]]" +// CHECK2-SAME: "-internal-isystem" +// CHECK2-SAME: {{^}} "[[SYSROOT]]/lib/gcc/avr/10.3.0/../../../../avr/include" + +// RUN: %clang %s -### -target avr --sysroot %S/Inputs/basic_avr_tree_2 -S 2>&1 | FileCheck --check-prefix=CHECK3 %s +// CHECK3: clang{{.*}} "-cc1" "-triple" "avr" +// CHECK3-SAME: "-isysroot" "[[SYSROOT:[^"]+/basic_avr_tree_2]]" +// CHECK3-SAME: "-internal-isystem" +// CHECK3-SAME: {{^}} "[[SYSROOT]]/usr/avr/include" + // RUN: %clang %s -### -target avr 2>&1 | FileCheck -check-prefix=CC1 %s // CC1: clang{{.*}} "-cc1" "-triple" "avr" {{.*}} "-fno-use-init-array" From a41c95c0e3c24076df7b9449e350e04c5c073126 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Tue, 17 Aug 2021 12:50:13 +0900 Subject: [PATCH 181/700] [LNICM] Fix infinite loop There is a bug introduced by https://reviews.llvm.org/D107219 which causes an infinite loop, when there are more than 2 levels PHINode chain. Reviewed By: uint256_t Differential Revision: https://reviews.llvm.org/D108166 --- llvm/lib/Transforms/Scalar/LICM.cpp | 2 +- llvm/test/Transforms/LICM/lnicm-sink.ll | 66 +++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 9e5e7d2a5935b..5e48809ff1aef 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1491,7 +1491,7 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, UI->getNumOperands() == 1) { if (!CurLoop->contains(UI)) break; - UI = cast(U->user_back()); + UI = cast(UI->user_back()); } } } diff --git a/llvm/test/Transforms/LICM/lnicm-sink.ll b/llvm/test/Transforms/LICM/lnicm-sink.ll index 0dc47e5f1a4a0..a94f5f39ab426 100644 --- a/llvm/test/Transforms/LICM/lnicm-sink.ll +++ b/llvm/test/Transforms/LICM/lnicm-sink.ll @@ -61,6 +61,72 @@ for.end7: ret double %t.0.lcssa } +; double sin(double); +; int abs(int); +; double test(double x, int 
y[10]) { +; double t = 0; int s = 0; +; for (int i = 0; i < 10; i++) { +; for (int k = 0; k < 10; k++) { +; for (int j = 0; j < 10; j++) { +; t = sin(x); +; s = abs(i); +; } +; } +; y[i] = s; +; } +; return t; +; } +; +define dso_local double @test2(double %x, i32* noalias %y) { +entry: + br label %for.body + +for.body: + %i.02 = phi i32 [ 0, %entry ], [ %inc6, %for.end ] + br label %for.k + +for.k: + %k = phi i64 [ 0, %for.body ], [ %inc.k, %for.end.k ] + br label %for.body3 + +; CHECK: for.body3: +; LNICM: call i32 @abs(i32 %i.02) +; LICM-NOT: call i32 @abs(i32 %i.02) +for.body3: + %j.01 = phi i32 [ 0, %for.k ], [ %inc, %for.body3 ] + %call = call double @sin(double %x) + %call4 = call i32 @abs(i32 %i.02) + %inc = add nsw i32 %j.01, 1 + %cmp2 = icmp slt i32 %inc, 10 + br i1 %cmp2, label %for.body3, label %for.end.k + +for.end.k: + %s.lcssa.k = phi i32 [ %call4, %for.body3 ] + %t.lcssa.k = phi double [ %call, %for.body3 ] + %inc.k = add nsw i64 %k, 1 + %cmp.k = icmp slt i64 %inc.k, 10 + br i1 %cmp.k, label %for.k, label %for.end + +; CHECK: for.end: +; LICM: call i32 @abs(i32 %i.02) +; LNICM-NOT: call i32 @abs(i32 %i.02) +for.end: + %s.1.lcssa = phi i32 [ %s.lcssa.k, %for.end.k ] + %t.1.lcssa = phi double [ %t.lcssa.k, %for.end.k ] + %idxprom = sext i32 %i.02 to i64 + %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom + store i32 %s.1.lcssa, i32* %arrayidx, align 4 + %inc6 = add nsw i32 %i.02, 1 + %cmp = icmp slt i32 %inc6, 10 + br i1 %cmp, label %for.body, label %for.end7 + +; CHECK: for.end7: +; CHECK: call double @sin(double %x) +for.end7: + %t.0.lcssa = phi double [ %t.1.lcssa, %for.end ] + ret double %t.0.lcssa +} + declare dso_local double @sin(double) #0 declare dso_local i32 @abs(i32) #0 From 8f5e9d65d65bc0473915f2d0d3d22305f0583de3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 16 Aug 2021 21:21:11 -0700 Subject: [PATCH 182/700] [AsmParser] Remove MDConstant (NFC) The last use was removed on Sep 22, 2016 in commit 
fcee2d80017f8e2db6a8ac3a70bdc0653afa7d01. --- llvm/lib/AsmParser/LLParser.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 64af1dc3751e4..746393d9bd216 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3876,10 +3876,6 @@ struct MDField : public MDFieldImpl { MDField(bool AllowNull = true) : ImplTy(nullptr), AllowNull(AllowNull) {} }; -struct MDConstant : public MDFieldImpl { - MDConstant() : ImplTy(nullptr) {} -}; - struct MDStringField : public MDFieldImpl { bool AllowEmpty; MDStringField(bool AllowEmpty = true) From 9790a2a72f60bb2caf891658c3c6a02b61e1f1a2 Mon Sep 17 00:00:00 2001 From: Yunde Zhong Date: Tue, 17 Aug 2021 13:02:23 +0800 Subject: [PATCH 183/700] [tests] precommit tests for D107692 --- llvm/test/CodeGen/AArch64/arm64-srl-and.ll | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/arm64-srl-and.ll diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll new file mode 100644 index 0000000000000..2f024e444d25f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -O3 < %s | FileCheck %s + +; Disable the dagcombine if operand has multi use + +@g = global i16 0, align 4 +define i32 @srl_and() { +; CHECK-LABEL: srl_and: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, :got:g +; CHECK-NEXT: ldr x8, [x8, :got_lo12:g] +; CHECK-NEXT: mov w9, #50 +; CHECK-NEXT: ldrh w8, [x8] +; CHECK-NEXT: eor w8, w8, w9 +; CHECK-NEXT: sub w8, w8, #1 +; CHECK-NEXT: and w0, w8, w8, lsr #16 +; CHECK-NEXT: ret +entry: + %0 = load i16, i16* @g, align 4 + %1 = xor i16 %0, 50 + %tobool = icmp ne i16 %1, 0 + %lor.ext = zext i1 %tobool to i32 + %sub = add i16 %1, -1 + + %srl = zext i16 %sub to i32 + %and = and i32 %srl, %lor.ext 
+ + ret i32 %and +} From 198e6771e24fb5d8d9f3b155445ecc2a4f211004 Mon Sep 17 00:00:00 2001 From: Deep Majumder Date: Tue, 17 Aug 2021 10:42:30 +0530 Subject: [PATCH 184/700] [analyzer] Add option to SATest.py for extra checkers This patch adds the flag `extra-checkers` to the sub-command `build` for passing a comma separated list of additional checkers to include. Differential Revision: https://reviews.llvm.org/D106739 --- clang/utils/analyzer/SATest.py | 5 +++++ clang/utils/analyzer/SATestBuild.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/clang/utils/analyzer/SATest.py b/clang/utils/analyzer/SATest.py index 176fe40a2b171..9931870b3b0dd 100755 --- a/clang/utils/analyzer/SATest.py +++ b/clang/utils/analyzer/SATest.py @@ -42,6 +42,7 @@ def build(parser, args): projects, args.override_compiler, args.extra_analyzer_config, + args.extra_checkers, args.regenerate, args.strictness) tests_passed = tester.test_all() @@ -250,6 +251,10 @@ def main(): dest="extra_analyzer_config", type=str, default="", help="Arguments passed to to -analyzer-config") + build_parser.add_argument("--extra-checkers", + dest="extra_checkers", type=str, + default="", + help="Extra checkers to enable") build_parser.add_argument("--projects", action="store", default="", help="Comma-separated list of projects to test") build_parser.add_argument("--max-size", action="store", default=None, diff --git a/clang/utils/analyzer/SATestBuild.py b/clang/utils/analyzer/SATestBuild.py index ed5c7379bb5b4..1977a8fc2aeff 100644 --- a/clang/utils/analyzer/SATestBuild.py +++ b/clang/utils/analyzer/SATestBuild.py @@ -213,6 +213,7 @@ class TestInfo(NamedTuple): project: ProjectInfo override_compiler: bool = False extra_analyzer_config: str = "" + extra_checkers: str = "" is_reference_build: bool = False strictness: int = 0 @@ -233,13 +234,16 @@ class RegressionTester: """ A component aggregating all of the project testing. 
""" + def __init__(self, jobs: int, projects: List[ProjectInfo], override_compiler: bool, extra_analyzer_config: str, + extra_checkers: str, regenerate: bool, strictness: bool): self.jobs = jobs self.projects = projects self.override_compiler = override_compiler self.extra_analyzer_config = extra_analyzer_config + self.extra_checkers = extra_checkers self.regenerate = regenerate self.strictness = strictness @@ -252,6 +256,7 @@ def test_all(self) -> bool: TestInfo(project, self.override_compiler, self.extra_analyzer_config, + self.extra_checkers, self.regenerate, self.strictness)) if self.jobs <= 1: return self._single_threaded_test_all(projects_to_test) @@ -305,10 +310,12 @@ class ProjectTester: """ A component aggregating testing for one project. """ + def __init__(self, test_info: TestInfo, silent: bool = False): self.project = test_info.project self.override_compiler = test_info.override_compiler self.extra_analyzer_config = test_info.extra_analyzer_config + self.extra_checkers = test_info.extra_checkers self.is_reference_build = test_info.is_reference_build self.strictness = test_info.strictness self.silent = silent @@ -414,6 +421,8 @@ def scan_build(self, directory: str, output_dir: str, if 'SA_ADDITIONAL_CHECKERS' in os.environ: all_checkers = (all_checkers + ',' + os.environ['SA_ADDITIONAL_CHECKERS']) + if self.extra_checkers != "": + all_checkers += "," + self.extra_checkers # Run scan-build from within the patched source directory. cwd = os.path.join(directory, PATCHED_SOURCE_DIR_NAME) From 1689dade4218945db175f7916c2261667f9bf371 Mon Sep 17 00:00:00 2001 From: John Demme Date: Mon, 16 Aug 2021 22:37:14 -0700 Subject: [PATCH 185/700] [MLIR] [Python] Allow 'operation.parent' to return 'None' This is more Pythonic and better matches the C++ and C APIs. 
Reviewed By: stellaraccident Differential Revision: https://reviews.llvm.org/D108183 --- mlir/lib/Bindings/Python/IRCore.cpp | 16 ++++++++++------ mlir/lib/Bindings/Python/IRModule.h | 3 ++- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 1f81550005250..3e927ceec190f 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -868,22 +868,23 @@ py::object PyOperationBase::getAsm(bool binary, return fileObject.attr("getvalue")(); } -PyOperationRef PyOperation::getParentOperation() { +llvm::Optional PyOperation::getParentOperation() { checkValid(); if (!isAttached()) throw SetPyError(PyExc_ValueError, "Detached operations have no parent"); MlirOperation operation = mlirOperationGetParentOperation(get()); if (mlirOperationIsNull(operation)) - throw SetPyError(PyExc_ValueError, "Operation has no parent."); + return {}; return PyOperation::forOperation(getContext(), operation); } PyBlock PyOperation::getBlock() { checkValid(); - PyOperationRef parentOperation = getParentOperation(); + llvm::Optional parentOperation = getParentOperation(); MlirBlock block = mlirOperationGetBlock(get()); assert(!mlirBlockIsNull(block) && "Attached operation has null parent"); - return PyBlock{std::move(parentOperation), block}; + assert(parentOperation && "Operation has no parent"); + return PyBlock{std::move(*parentOperation), block}; } py::object PyOperation::getCapsule() { @@ -2121,8 +2122,11 @@ void mlir::python::populateIRCore(py::module &m) { py::arg("loc") = py::none(), py::arg("ip") = py::none(), kOperationCreateDocstring) .def_property_readonly("parent", - [](PyOperation &self) { - return self.getParentOperation().getObject(); + [](PyOperation &self) -> py::object { + auto parent = self.getParentOperation(); + if (parent) + return parent->getObject(); + return py::none(); }) .def("erase", &PyOperation::erase) 
.def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 79c480e9446f5..9d217c872191d 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -18,6 +18,7 @@ #include "mlir-c/IR.h" #include "mlir-c/IntegerSet.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" namespace mlir { namespace python { @@ -452,7 +453,7 @@ class PyOperation : public PyOperationBase, public BaseContextObject { /// Gets the parent operation or raises an exception if the operation has /// no parent. - PyOperationRef getParentOperation(); + llvm::Optional getParentOperation(); /// Gets a capsule wrapping the void* within the MlirOperation. pybind11::object getCapsule(); From ce8272afb3fde5599d6d3827576aa9edda0da322 Mon Sep 17 00:00:00 2001 From: Riccardo Mori Date: Tue, 17 Aug 2021 09:34:28 +0200 Subject: [PATCH 186/700] [Polly][Isl] Use isl::val::sub instead of isl::val::sub_ui. NFC This is part of an effort to reduce the differences between the custom C++ bindings used right now by polly in `lib/External/isl/include/isl/isl-noxceptions.h` and the official isl C++ interface. 
Changes made: - Use `isl::val::sub` instead of `isl::val::sub_ui` - `isl-noexceptions.h` has been generated by https://github.com/patacca/isl/commit/355e84163ae78ff637c71fb532f36d15277a2b1b Depends on D107225 Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D107293 --- polly/lib/Analysis/ScopInfo.cpp | 2 +- polly/lib/External/isl/include/isl/isl-noexceptions.h | 7 ------- polly/unittests/Isl/IslTest.cpp | 6 +++--- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 965776b8b3457..659961215b951 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -195,7 +195,7 @@ static isl::set addRangeBoundsToSet(isl::set S, const ConstantRange &Range, isl::set SLB = S.lower_bound_val(type, dim, V); V = valFromAPInt(Ctx.get(), Range.getUpper(), true); - V = V.sub_ui(1); + V = V.sub(1); isl::set SUB = S.upper_bound_val(type, dim, V); S = SLB.unite(SUB); } diff --git a/polly/lib/External/isl/include/isl/isl-noexceptions.h b/polly/lib/External/isl/include/isl/isl-noexceptions.h index 5142941900ce8..4e81211c77094 100644 --- a/polly/lib/External/isl/include/isl/isl-noexceptions.h +++ b/polly/lib/External/isl/include/isl/isl-noexceptions.h @@ -4801,7 +4801,6 @@ class val { inline int sgn() const; inline isl::val sub(isl::val v2) const; inline isl::val sub(long v2) const; - inline isl::val sub_ui(unsigned long v2) const; inline isl::val_list to_list() const; inline isl::val trunc() const; static inline isl::val zero(isl::ctx ctx); @@ -22787,12 +22786,6 @@ isl::val val::sub(long v2) const return this->sub(isl::val(ctx(), v2)); } -isl::val val::sub_ui(unsigned long v2) const -{ - auto res = isl_val_sub_ui(copy(), v2); - return manage(res); -} - isl::val_list val::to_list() const { auto res = isl_val_to_list(copy()); diff --git a/polly/unittests/Isl/IslTest.cpp b/polly/unittests/Isl/IslTest.cpp index 72c0a85133b90..42730af278dd2 100644 --- 
a/polly/unittests/Isl/IslTest.cpp +++ b/polly/unittests/Isl/IslTest.cpp @@ -136,7 +136,7 @@ TEST(Isl, APIntToIslVal) { { APInt APNOne(32, (1ull << 32) - 1, false); auto IslNOne = valFromAPInt(IslCtx, APNOne, false); - auto IslRef = isl::val(IslCtx, 32).pow2().sub_ui(1); + auto IslRef = isl::val(IslCtx, 32).pow2().sub(1); EXPECT_EQ(IslNOne, IslRef); } @@ -223,7 +223,7 @@ TEST(Isl, IslValToAPInt) { } { - auto IslNOne = isl::val(IslCtx, 32).pow2().sub_ui(1); + auto IslNOne = isl::val(IslCtx, 32).pow2().sub(1); auto APNOne = APIntFromVal(IslNOne); EXPECT_EQ((1ull << 32) - 1, APNOne); EXPECT_EQ(33u, APNOne.getBitWidth()); @@ -232,7 +232,7 @@ TEST(Isl, IslValToAPInt) { { auto IslLargeNum = isl::val(IslCtx, 60); IslLargeNum = IslLargeNum.pow2(); - IslLargeNum = IslLargeNum.sub_ui(1); + IslLargeNum = IslLargeNum.sub(1); auto APLargeNum = APIntFromVal(IslLargeNum); EXPECT_EQ((1ull << 60) - 1, APLargeNum); EXPECT_EQ(61u, APLargeNum.getBitWidth()); From 4f21e6aeddc2dbe4ae22aba5b97cae0c50c961f6 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Fri, 13 Aug 2021 16:27:46 +0000 Subject: [PATCH 187/700] [flang][nfc] Tweak the FrontendAction class This patch refactors the `FrontendAction` class. It merely moves code around so that re-using it is easier. No new functionality is introduced. 1. Three new member methods are introduced: `RunPrescan`, `RunParse`, `RunSemanticChecks`. 2. The following free functions are re-implemented as member methods: * `reportFatalSemanticErrors` * `reportFatalScanningErrors` * `reportFatalParsingErrors` * `reportFatalErrors` `reportFatalSemanticErrors` is updated to resemble the other error reporting functions and to make the API more consistent. 3. The `BeginSourceFileAction` methods are simplified and the unused input argument is deleted. 
Differential Revision: https://reviews.llvm.org/D108130 --- flang/include/flang/Frontend/FrontendAction.h | 30 +++- .../include/flang/Frontend/FrontendActions.h | 6 +- flang/lib/Frontend/FrontendAction.cpp | 95 +++++++++- flang/lib/Frontend/FrontendActions.cpp | 164 +++--------------- 4 files changed, 147 insertions(+), 148 deletions(-) diff --git a/flang/include/flang/Frontend/FrontendAction.h b/flang/include/flang/Frontend/FrontendAction.h index 87e82fe5274f5..aac1fcf268a08 100644 --- a/flang/include/flang/Frontend/FrontendAction.h +++ b/flang/include/flang/Frontend/FrontendAction.h @@ -43,7 +43,7 @@ class FrontendAction { /// /// \return True on success; on failure ExecutionAction() and /// EndSourceFileAction() will not be called. - virtual bool BeginSourceFileAction(CompilerInstance &ci) { return true; } + virtual bool BeginSourceFileAction() { return true; } /// @} @@ -100,6 +100,34 @@ class FrontendAction { /// Perform any per-file post processing, deallocate per-file /// objects, and run statistics and output file cleanup code. void EndSourceFile(); + + /// @} +protected: + // Prescan the current input file. Return False if fatal errors are reported, + // True otherwise. + bool RunPrescan(); + // Parse the current input file. Return False if fatal errors are reported, + // True otherwise. + bool RunParse(); + // Run semantic checks for the current input file. Return False if fatal + // errors are reported, True otherwise. + bool RunSemanticChecks(); + + // Report fatal semantic errors. Return True if present, false otherwise. + bool reportFatalSemanticErrors(); + + // Report fatal scanning errors. Return True if present, false otherwise. + inline bool reportFatalScanningErrors() { + return reportFatalErrors("Could not scan %0"); + } + + // Report fatal parsing errors. 
Return True if present, false otherwise + inline bool reportFatalParsingErrors() { + return reportFatalErrors("Could not parse %0"); + } + +private: + template bool reportFatalErrors(const char (&message)[N]); }; } // namespace Fortran::frontend diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h index 9cfaabcf7677e..43fd1f0f65965 100644 --- a/flang/include/flang/Frontend/FrontendActions.h +++ b/flang/include/flang/Frontend/FrontendActions.h @@ -51,7 +51,7 @@ class InitOnlyAction : public FrontendAction { //===----------------------------------------------------------------------===// class PrescanAction : public FrontendAction { void ExecuteAction() override = 0; - bool BeginSourceFileAction(CompilerInstance &ci) override; + bool BeginSourceFileAction() override; }; class PrintPreprocessedAction : public PrescanAction { @@ -75,7 +75,7 @@ class DebugMeasureParseTreeAction : public PrescanAction { //===----------------------------------------------------------------------===// class PrescanAndParseAction : public FrontendAction { void ExecuteAction() override = 0; - bool BeginSourceFileAction(CompilerInstance &ci) override; + bool BeginSourceFileAction() override; }; class DebugUnparseNoSemaAction : public PrescanAndParseAction { @@ -92,7 +92,7 @@ class DebugDumpParseTreeNoSemaAction : public PrescanAndParseAction { class PrescanAndSemaAction : public FrontendAction { void ExecuteAction() override = 0; - bool BeginSourceFileAction(CompilerInstance &ci) override; + bool BeginSourceFileAction() override; }; class DebugUnparseWithSymbolsAction : public PrescanAndSemaAction { diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp index 77700d2abec78..5285681e2904a 100644 --- a/flang/lib/Frontend/FrontendAction.cpp +++ b/flang/lib/Frontend/FrontendAction.cpp @@ -89,7 +89,7 @@ bool FrontendAction::BeginSourceFile( invoc.fortranOpts().isFixedForm = currentInput().IsFixedForm(); } - 
if (!BeginSourceFileAction(ci)) { + if (!BeginSourceFileAction()) { BeginSourceFileCleanUp(*this, ci); return false; } @@ -117,3 +117,96 @@ void FrontendAction::EndSourceFile() { set_instance(nullptr); set_currentInput(FrontendInputFile()); } + +bool FrontendAction::RunPrescan() { + CompilerInstance &ci = this->instance(); + std::string currentInputPath{GetCurrentFileOrBufferName()}; + Fortran::parser::Options parserOptions = ci.invocation().fortranOpts(); + + if (ci.invocation().frontendOpts().fortranForm == FortranForm::Unknown) { + // Switch between fixed and free form format based on the input file + // extension. + // + // Ideally we should have all Fortran options set before entering this + // method (i.e. before processing any specific input files). However, we + // can't decide between fixed and free form based on the file extension + // earlier than this. + parserOptions.isFixedForm = currentInput().IsFixedForm(); + } + + // Prescan. In case of failure, report and return. + ci.parsing().Prescan(currentInputPath, parserOptions); + + return !reportFatalScanningErrors(); +} + +bool FrontendAction::RunParse() { + CompilerInstance &ci = this->instance(); + + // Parse. In case of failure, report and return. 
+ ci.parsing().Parse(llvm::outs()); + + if (reportFatalParsingErrors()) { + return false; + } + + // Report the diagnostics from parsing + ci.parsing().messages().Emit(llvm::errs(), ci.allCookedSources()); + + return true; +} + +bool FrontendAction::RunSemanticChecks() { + CompilerInstance &ci = this->instance(); + std::optional &parseTree{ci.parsing().parseTree()}; + assert(parseTree && "Cannot run semantic checks without a parse tree!"); + + // Prepare semantics + ci.setSemantics(std::make_unique( + ci.invocation().semanticsContext(), *parseTree, + ci.invocation().debugModuleDir())); + auto &semantics = ci.semantics(); + + // Run semantic checks + semantics.Perform(); + + if (reportFatalSemanticErrors()) { + return false; + } + + // Report the diagnostics from the semantic checks + semantics.EmitMessages(ci.semaOutputStream()); + + return true; +} + +template +bool FrontendAction::reportFatalErrors(const char (&message)[N]) { + if (!instance_->parsing().messages().empty() && + (instance_->invocation().warnAsErr() || + instance_->parsing().messages().AnyFatalError())) { + const unsigned diagID = instance_->diagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Error, message); + instance_->diagnostics().Report(diagID) << GetCurrentFileOrBufferName(); + instance_->parsing().messages().Emit( + llvm::errs(), instance_->allCookedSources()); + return true; + } + return false; +} + +bool FrontendAction::reportFatalSemanticErrors() { + auto &diags = instance_->diagnostics(); + auto &sema = instance_->semantics(); + + if (instance_->semantics().AnyFatalError()) { + unsigned DiagID = diags.getCustomDiagID( + clang::DiagnosticsEngine::Error, "Semantic errors in %0"); + diags.Report(DiagID) << GetCurrentFileOrBufferName(); + sema.EmitMessages(instance_->semaOutputStream()); + + return true; + } + + return false; +} diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index acd6b049dfe8d..f5ff2095281cd 100644 --- 
a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -27,132 +27,22 @@ using namespace Fortran::frontend; -/// Report fatal semantic errors if present. -/// -/// \param semantics The semantics instance -/// \param diags The diagnostics engine instance -/// \param bufferName The file or buffer name -/// -/// \return True if fatal semantic errors are present, false if not -bool reportFatalSemanticErrors(const Fortran::semantics::Semantics &semantics, - clang::DiagnosticsEngine &diags, const llvm::StringRef &bufferName) { - if (semantics.AnyFatalError()) { - unsigned DiagID = diags.getCustomDiagID( - clang::DiagnosticsEngine::Error, "Semantic errors in %0"); - diags.Report(DiagID) << bufferName; - return true; - } - return false; -} - -template -static bool reportFatalErrors( - const FrontendAction *act, const char (&message)[N]) { - CompilerInstance &ci = act->instance(); - if (!ci.parsing().messages().empty() && - (ci.invocation().warnAsErr() || - ci.parsing().messages().AnyFatalError())) { - const unsigned diagID = ci.diagnostics().getCustomDiagID( - clang::DiagnosticsEngine::Error, message); - ci.diagnostics().Report(diagID) << act->GetCurrentFileOrBufferName(); - ci.parsing().messages().Emit(llvm::errs(), ci.allCookedSources()); - return true; - } - return false; -} - -inline bool reportFatalScanningErrors(const FrontendAction *act) { - return reportFatalErrors(act, "Could not scan %0"); -} - -inline bool reportFatalParsingErrors(const FrontendAction *act) { - return reportFatalErrors(act, "Could not parse %0"); -} - -bool PrescanAction::BeginSourceFileAction(CompilerInstance &c1) { - CompilerInstance &ci = this->instance(); - std::string currentInputPath{GetCurrentFileOrBufferName()}; - Fortran::parser::Options parserOptions = ci.invocation().fortranOpts(); - - // Prescan. In case of failure, report and return. 
- ci.parsing().Prescan(currentInputPath, parserOptions); - - return !reportFatalScanningErrors(this); -} - -bool PrescanAndParseAction::BeginSourceFileAction(CompilerInstance &c1) { - CompilerInstance &ci = this->instance(); - - std::string currentInputPath{GetCurrentFileOrBufferName()}; - - Fortran::parser::Options parserOptions = ci.invocation().fortranOpts(); - - if (ci.invocation().frontendOpts().fortranForm == FortranForm::Unknown) { - // Switch between fixed and free form format based on the input file - // extension. - // - // Ideally we should have all Fortran options set before entering this - // method (i.e. before processing any specific input files). However, we - // can't decide between fixed and free form based on the file extension - // earlier than this. - parserOptions.isFixedForm = currentInput().IsFixedForm(); - } - - // Prescan. In case of failure, report and return. - ci.parsing().Prescan(currentInputPath, parserOptions); - - if (reportFatalScanningErrors(this)) - return false; - - // Parse. In case of failure, report and return. - ci.parsing().Parse(llvm::outs()); - - if (reportFatalParsingErrors(this)) - return false; - - // Report the diagnostics from parsing - ci.parsing().messages().Emit(llvm::errs(), ci.allCookedSources()); +//===----------------------------------------------------------------------===// +// Custom BeginSourceFileAction +//===----------------------------------------------------------------------===// +bool PrescanAction::BeginSourceFileAction() { return RunPrescan(); } - return true; +bool PrescanAndParseAction::BeginSourceFileAction() { + return RunPrescan() && RunParse(); } -bool PrescanAndSemaAction::BeginSourceFileAction(CompilerInstance &c1) { - CompilerInstance &ci = this->instance(); - std::string currentInputPath{GetCurrentFileOrBufferName()}; - Fortran::parser::Options parserOptions = ci.invocation().fortranOpts(); - - // Prescan. In case of failure, report and return. 
- ci.parsing().Prescan(currentInputPath, parserOptions); - - if (reportFatalScanningErrors(this)) - return false; - - // Parse. In case of failure, report and return. - ci.parsing().Parse(llvm::outs()); - - if (reportFatalParsingErrors(this)) - return false; - - // Report the diagnostics from parsing - ci.parsing().messages().Emit(llvm::errs(), ci.allCookedSources()); - - auto &parseTree{*ci.parsing().parseTree()}; - - // Prepare semantics - ci.setSemantics(std::make_unique( - ci.invocation().semanticsContext(), parseTree, - ci.invocation().debugModuleDir())); - auto &semantics = ci.semantics(); - - // Run semantic checks - semantics.Perform(); - - // Report the diagnostics from the semantic checks - semantics.EmitMessages(ci.semaOutputStream()); - - return true; +bool PrescanAndSemaAction::BeginSourceFileAction() { + return RunPrescan() & RunParse() && RunSemanticChecks(); } +//===----------------------------------------------------------------------===// +// Custom ExecuteAction +//===----------------------------------------------------------------------===// void InputOutputTestAction::ExecuteAction() { CompilerInstance &ci = instance(); @@ -224,10 +114,6 @@ void DebugDumpProvenanceAction::ExecuteAction() { } void ParseSyntaxOnlyAction::ExecuteAction() { - CompilerInstance &ci = this->instance(); - - reportFatalSemanticErrors( - ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName()); } void DebugUnparseNoSemaAction::ExecuteAction() { @@ -258,20 +144,17 @@ void DebugUnparseAction::ExecuteAction() { invoc.useAnalyzedObjectsForUnparse() ? 
&invoc.asFortran() : nullptr); // Report fatal semantic errors - reportFatalSemanticErrors(ci.semantics(), this->instance().diagnostics(), - GetCurrentFileOrBufferName()); + reportFatalSemanticErrors(); } void DebugUnparseWithSymbolsAction::ExecuteAction() { - CompilerInstance &ci = this->instance(); auto &parseTree{*instance().parsing().parseTree()}; Fortran::semantics::UnparseWithSymbols( llvm::outs(), parseTree, /*encoding=*/Fortran::parser::Encoding::UTF_8); // Report fatal semantic errors - reportFatalSemanticErrors( - ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName()); + reportFatalSemanticErrors(); } void DebugDumpSymbolsAction::ExecuteAction() { @@ -283,8 +166,7 @@ void DebugDumpSymbolsAction::ExecuteAction() { // The runtime derived type information table builder may find and report // semantic errors. So it is important that we report them _after_ // BuildRuntimeDerivedTypeTables is run. - reportFatalSemanticErrors( - semantics, this->instance().diagnostics(), GetCurrentFileOrBufferName()); + reportFatalSemanticErrors(); if (!tables.schemata) { unsigned DiagID = @@ -315,8 +197,7 @@ void DebugDumpAllAction::ExecuteAction() { // The runtime derived type information table builder may find and report // semantic errors. So it is important that we report them _after_ // BuildRuntimeDerivedTypeTables is run. 
- reportFatalSemanticErrors( - semantics, this->instance().diagnostics(), GetCurrentFileOrBufferName()); + reportFatalSemanticErrors(); if (!tables.schemata) { unsigned DiagID = @@ -342,7 +223,6 @@ void DebugDumpParseTreeNoSemaAction::ExecuteAction() { } void DebugDumpParseTreeAction::ExecuteAction() { - CompilerInstance &ci = this->instance(); auto &parseTree{instance().parsing().parseTree()}; // Dump parse tree @@ -350,8 +230,7 @@ void DebugDumpParseTreeAction::ExecuteAction() { llvm::outs(), parseTree, &this->instance().invocation().asFortran()); // Report fatal semantic errors - reportFatalSemanticErrors( - ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName()); + reportFatalSemanticErrors(); } void DebugMeasureParseTreeAction::ExecuteAction() { @@ -388,8 +267,7 @@ void DebugMeasureParseTreeAction::ExecuteAction() { void DebugPreFIRTreeAction::ExecuteAction() { CompilerInstance &ci = this->instance(); // Report and exit if fatal semantic errors are present - if (reportFatalSemanticErrors( - ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) { + if (reportFatalSemanticErrors()) { return; } @@ -417,9 +295,9 @@ void GetDefinitionAction::ExecuteAction() { CompilerInstance &ci = this->instance(); // Report and exit if fatal semantic errors are present - if (reportFatalSemanticErrors( - ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) + if (reportFatalSemanticErrors()) { return; + } parser::AllCookedSources &cs = ci.allCookedSources(); unsigned diagID = ci.diagnostics().getCustomDiagID( @@ -465,9 +343,9 @@ void GetSymbolsSourcesAction::ExecuteAction() { CompilerInstance &ci = this->instance(); // Report and exit if fatal semantic errors are present - if (reportFatalSemanticErrors( - ci.semantics(), ci.diagnostics(), GetCurrentFileOrBufferName())) + if (reportFatalSemanticErrors()) { return; + } ci.semantics().DumpSymbolsSources(llvm::outs()); } From ebdb0d09a4f464417e2d916be550ab227f1b3549 Mon Sep 17 00:00:00 2001 From: David 
Stuttard Date: Wed, 5 Feb 2020 14:22:18 +0000 Subject: [PATCH 188/700] AMDGPU: During img instruction ret value construction cater for non int values Make sure return type is int type. Differential Revision: https://reviews.llvm.org/D108131 Change-Id: Ic02f07d1234cd51b6ed78c3fecd2cb1d6acd5644 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b16cf1f3bed3e..9a81ada5830d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5948,6 +5948,9 @@ static SDValue constructRetValue(SelectionDAG &DAG, EVT LegalReqRetVT = ReqRetVT; if (!ReqRetVT.isVector()) { + if (!Data.getValueType().isInteger()) + Data = DAG.getNode(ISD::BITCAST, DL, + Data.getValueType().changeTypeToInteger(), Data); Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); } else { // We need to widen the return vector to a legal type From bcec4ccd04ae678a0d17b8fe8170e04221bf1959 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Thu, 5 Aug 2021 17:01:21 +0800 Subject: [PATCH 189/700] [X86] [AMX] Replace bitcast with specific AMX intrinsics with X86 specific cast. There is some discussion on the bitcast for vector and x86_amx at https://reviews.llvm.org/D99152. This patch is to introduce a x86 specific cast for vector and x86_amx, so that it can avoid some unnecessary optimization by middle-end. On the other way, we have to optimize the x86 specific cast by ourselves. This patch also optimize the cast operation to eliminate redundant code. 
Reviewed By: LuoYuanke Differential Revision: https://reviews.llvm.org/D107544 --- llvm/include/llvm/IR/IntrinsicsX86.td | 4 + llvm/lib/Target/X86/X86LowerAMXType.cpp | 505 ++++++++++++++++-- llvm/test/CodeGen/X86/AMX/amx-type.ll | 101 ++-- .../X86/AMX/lat-combine-amx-bitcast.ll | 412 ++++++++++++++ .../X86/AMX/lat-transform-amx-bitcast.ll | 429 +++++++++++++++ 5 files changed, 1350 insertions(+), 101 deletions(-) create mode 100644 llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll create mode 100644 llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index ae0a416175f9e..eba83493e686d 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5093,6 +5093,10 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + def int_x86_cast_vector_to_tile: + Intrinsic<[llvm_x86amx_ty], [llvm_anyvector_ty], [IntrNoMem]>; + def int_x86_cast_tile_to_vector: + Intrinsic<[llvm_anyvector_ty], [llvm_x86amx_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index 4ba44ccb6c160..a2bcc98f3d5b6 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -40,8 +40,10 @@ // #include "X86.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -56,66 +58,44 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/AssumeBundleBuilder.h" +#include 
"llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; #define DEBUG_TYPE "lower-amx-type" -static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, - BasicBlock *BB) { +static bool isAMXCast(Instruction *II) { + return match(II, + m_Intrinsic(m_Value())) || + match(II, m_Intrinsic(m_Value())); +} + +static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB, + Type *Ty) { Function &F = *BB->getParent(); Module *M = BB->getModule(); const DataLayout &DL = M->getDataLayout(); - Type *V256I32Ty = VectorType::get(Builder.getInt32Ty(), 256, false); LLVMContext &Ctx = Builder.getContext(); auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx)); unsigned AllocaAS = DL.getAllocaAddrSpace(); AllocaInst *AllocaRes = - new AllocaInst(V256I32Ty, AllocaAS, "", &F.getEntryBlock().front()); + new AllocaInst(Ty, AllocaAS, "", &F.getEntryBlock().front()); AllocaRes->setAlignment(AllocaAlignment); return AllocaRes; } -namespace { -class X86LowerAMXType { - Function &Func; - TargetMachine *TM = nullptr; - - // In AMX intrinsics we let Shape = {Row, Col}, but the - // RealCol = Col / ElementSize. We may use the RealCol - // as a new Row for other new created AMX intrinsics. 
- std::map Col2Row; - -public: - X86LowerAMXType(Function &F, TargetMachine *TargetM) : Func(F), TM(TargetM) {} - bool visit(); - void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); - void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); - bool transformBitcast(BitCastInst *Bitcast); - std::pair getShape(IntrinsicInst *II, unsigned OpNo); - Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); -}; - -Value *X86LowerAMXType::getRowFromCol(Instruction *II, Value *V, - unsigned Granularity) { - if (Col2Row.count(V)) - return Col2Row[V]; - IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt()); - if (auto *I = dyn_cast(V)) { - BasicBlock::iterator Iter = I->getIterator(); - ++Iter; - Builder.SetInsertPoint(&*Iter); - } - ConstantInt *Gran = Builder.getInt16(Granularity); - Value *RealRow = Builder.CreateUDiv(V, Gran); - Col2Row[V] = RealRow; - return RealRow; +static Instruction *getFirstNonAllocaInTheEntryBlock(Function &F) { + for (Instruction &I : F.getEntryBlock()) + if (!isa(&I)) + return &I; + llvm_unreachable("No terminator in the entry block!"); } -std::pair X86LowerAMXType::getShape(IntrinsicInst *II, - unsigned OpNo) { +static std::pair getShape(IntrinsicInst *II, unsigned OpNo) { + IRBuilder<> Builder(II); Value *Row = nullptr, *Col = nullptr; switch (II->getIntrinsicID()) { default: @@ -144,14 +124,32 @@ std::pair X86LowerAMXType::getShape(IntrinsicInst *II, Col = II->getArgOperand(2); break; case 5: - Row = II->getArgOperand(2); - // FIXME: There is a design bug for AMX shape, which the Col should be - // Col/4 if it will be used as Row, but current Greedy RA can't handle - // this case well, it may failed if we generate a new Shape definition. - // So Let's just do it in O0 first. 
- // Row = Row / 4 - if (TM->getOptLevel() == CodeGenOpt::None) - Row = getRowFromCol(II, Row, 4); + if (isa(II->getArgOperand(2))) + Row = Builder.getInt16( + (dyn_cast(II->getOperand(2))->getSExtValue()) / 4); + else if (isa(II->getArgOperand(2))) { + // When it is not a const value and it is not a function argument, we + // create Row after the definition of II->getOperand(2) instead of + // before II. For example, II is %118, we try to getshape for %117: + // %117 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x + // i32> %115). + // %118 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 + // %104, i16 %105, i16 %106, x86_amx %110, x86_amx %114, x86_amx + // %117). + // If we create %row = udiv i16 %106, 4 before %118(aka. II), then its + // definition is after its user(new tileload for %117). + // So, the best choice is to create %row right after the definition of + // %106. + Builder.SetInsertPoint(cast(II->getOperand(2))); + Row = Builder.CreateUDiv(II->getOperand(2), Builder.getInt16(4)); + cast(Row)->moveAfter(cast(II->getOperand(2))); + } else { + // When it is not a const value and it is a function argument, we create + // Row at the entry bb. + IRBuilder<> NewBuilder( + getFirstNonAllocaInTheEntryBlock(*II->getFunction())); + Row = NewBuilder.CreateUDiv(II->getOperand(2), NewBuilder.getInt16(4)); + } Col = II->getArgOperand(1); break; } @@ -162,6 +160,40 @@ std::pair X86LowerAMXType::getShape(IntrinsicInst *II, return std::make_pair(Row, Col); } +namespace { +class X86LowerAMXType { + Function &Func; + + // In AMX intrinsics we let Shape = {Row, Col}, but the + // RealCol = Col / ElementSize. We may use the RealCol + // as a new Row for other new created AMX intrinsics. 
+ std::map Col2Row; + +public: + X86LowerAMXType(Function &F) : Func(F) {} + bool visit(); + void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast); + void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST); + bool transformBitcast(BitCastInst *Bitcast); + Value *getRowFromCol(Instruction *II, Value *V, unsigned Granularity); +}; + +Value *X86LowerAMXType::getRowFromCol(Instruction *II, Value *V, + unsigned Granularity) { + if (Col2Row.count(V)) + return Col2Row[V]; + IRBuilder<> Builder(&*II->getParent()->getFirstInsertionPt()); + if (auto *I = dyn_cast(V)) { + BasicBlock::iterator Iter = I->getIterator(); + ++Iter; + Builder.SetInsertPoint(&*Iter); + } + ConstantInt *Gran = Builder.getInt16(Granularity); + Value *RealRow = Builder.CreateUDiv(V, Gran); + Col2Row[V] = RealRow; + return RealRow; +} + // %src = load <256 x i32>, <256 x i32>* %addr, align 64 // %2 = bitcast <256 x i32> %src to x86_amx // --> @@ -230,8 +262,8 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Value *I8Ptr, *Stride; auto *Src = Bitcast->getOperand(0); - auto Prepare = [&]() { - AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent()); + auto Prepare = [&](Type *MemTy) { + AllocaAddr = createAllocaInstAtEntry(Builder, Bitcast->getParent(), MemTy); I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy()); Stride = Builder.getInt64(64); }; @@ -250,7 +282,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { auto *II = dyn_cast(U.getUser()); if (!II) return false; // May be bitcast from x86amx to <256 x i32>. - Prepare(); + Prepare(Bitcast->getOperand(0)->getType()); Builder.CreateStore(Src, AllocaAddr); // TODO we can pick an constant operand for the shape. Value *Row = nullptr, *Col = nullptr; @@ -270,7 +302,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { auto *II = dyn_cast(Src); if (!II) return false; // May be bitcast from <256 x i32> to x86amx. 
- Prepare(); + Prepare(Bitcast->getType()); Value *Row = II->getOperand(0); Value *Col = II->getOperand(1); std::array Args = {Row, Col, I8Ptr, Stride, Src}; @@ -637,6 +669,364 @@ bool X86VolatileTileData::volatileTileData() { namespace { +class X86LowerAMXCast { + Function &Func; + +public: + X86LowerAMXCast(Function &F) : Func(F) {} + bool combineAMXcast(TargetLibraryInfo *TLI); + bool transformAMXCast(IntrinsicInst *AMXCast); + bool transformAllAMXCast(); + bool optimizeAMXCastFromPhi(IntrinsicInst *CI, PHINode *PN, + SmallSetVector &DeadInst); +}; + +static bool DCEInstruction(Instruction *I, + SmallSetVector &WorkList, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + salvageDebugInfo(*I); + salvageKnowledge(I); + + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast(OpV)) { + if (isInstructionTriviallyDead(OpI, TLI)) { + WorkList.insert(OpI); + } + } + } + I->eraseFromParent(); + return true; + } + return false; +} + +/// This function handles following case +/// +/// A -> B amxcast +/// PHI +/// B -> A amxcast +/// +/// All the related PHI nodes can be replaced by new PHI nodes with type A. +/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN. 
+bool X86LowerAMXCast::optimizeAMXCastFromPhi( + IntrinsicInst *CI, PHINode *PN, + SmallSetVector &DeadInst) { + IRBuilder<> Builder(CI); + Value *Src = CI->getOperand(0); + Type *SrcTy = Src->getType(); // Type B + Type *DestTy = CI->getType(); // Type A + + SmallVector PhiWorklist; + SmallSetVector OldPhiNodes; + + // Find all of the A->B casts and PHI nodes. + // We need to inspect all related PHI nodes, but PHIs can be cyclic, so + // OldPhiNodes is used to track all known PHI nodes, before adding a new + // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first. + PhiWorklist.push_back(PN); + OldPhiNodes.insert(PN); + while (!PhiWorklist.empty()) { + auto *OldPN = PhiWorklist.pop_back_val(); + for (Value *IncValue : OldPN->incoming_values()) { + // TODO: currently, We ignore cases where it is a const. In the future, we + // might support const. + if (isa(IncValue)) + return false; + + if (auto *PNode = dyn_cast(IncValue)) { + if (OldPhiNodes.insert(PNode)) + PhiWorklist.push_back(PNode); + continue; + } + Instruction *ACI = dyn_cast(IncValue); + if (ACI && isAMXCast(ACI)) { + // Verify it's a A->B cast. + Type *TyA = ACI->getOperand(0)->getType(); + Type *TyB = ACI->getType(); + if (TyA != DestTy || TyB != SrcTy) + return false; + continue; + } + return false; + } + } + + // Check that each user of each old PHI node is something that we can + // rewrite, so that all of the old PHI nodes can be cleaned up afterwards. + for (auto *OldPN : OldPhiNodes) { + for (User *V : OldPN->users()) { + Instruction *ACI = dyn_cast(V); + if (ACI && isAMXCast(ACI)) { + // Verify it's a B->A cast. + Type *TyB = ACI->getOperand(0)->getType(); + Type *TyA = ACI->getType(); + if (TyA != DestTy || TyB != SrcTy) + return false; + } else if (auto *PHI = dyn_cast(V)) { + // As long as the user is another old PHI node, then even if we don't + // rewrite it, the PHI web we're considering won't have any users + // outside itself, so it'll be dead. 
+ // example: + // bb.0: + // %0 = amxcast ... + // bb.1: + // %1 = amxcast ... + // bb.2: + // %goodphi = phi %0, %1 + // %3 = amxcast %goodphi + // bb.3: + // %goodphi2 = phi %0, %goodphi + // %4 = amxcast %goodphi2 + // When optimizeAMXCastFromPhi process %3 and %goodphi, %goodphi2 is + // outside the phi-web, so the combination stop When + // optimizeAMXCastFromPhi process %4 and %goodphi2, the optimization + // will be done. + if (OldPhiNodes.count(PHI) == 0) + return false; + } else + return false; + } + } + + // For each old PHI node, create a corresponding new PHI node with a type A. + SmallDenseMap NewPNodes; + for (auto *OldPN : OldPhiNodes) { + Builder.SetInsertPoint(OldPN); + PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands()); + NewPNodes[OldPN] = NewPN; + } + + // Fill in the operands of new PHI nodes. + for (auto *OldPN : OldPhiNodes) { + PHINode *NewPN = NewPNodes[OldPN]; + for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) { + Value *V = OldPN->getOperand(j); + Value *NewV = nullptr; + Instruction *ACI = dyn_cast(V); + // There should not be a AMXcast from a const. + if (ACI && isAMXCast(ACI)) + NewV = ACI->getOperand(0); + else if (auto *PrevPN = dyn_cast(V)) + NewV = NewPNodes[PrevPN]; + assert(NewV); + NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j)); + } + } + + // Traverse all accumulated PHI nodes and process its users, + // which are Stores and BitcCasts. Without this processing + // NewPHI nodes could be replicated and could lead to extra + // moves generated after DeSSA. + // If there is a store with type B, change it to type A. + + // Replace users of BitCast B->A with NewPHI. These will help + // later to get rid of a closure formed by OldPHI nodes. 
+ for (auto *OldPN : OldPhiNodes) { + PHINode *NewPN = NewPNodes[OldPN]; + for (User *V : make_early_inc_range(OldPN->users())) { + Instruction *ACI = dyn_cast(V); + if (ACI && isAMXCast(ACI)) { + Type *TyB = ACI->getOperand(0)->getType(); + Type *TyA = ACI->getType(); + assert(TyA == DestTy && TyB == SrcTy); + (void)TyA; + (void)TyB; + ACI->replaceAllUsesWith(NewPN); + DeadInst.insert(ACI); + } else if (auto *PHI = dyn_cast(V)) { + // We don't need to push PHINode into DeadInst since they are operands + // of rootPN DCE can safely delete rootPN's operands if rootPN is dead. + assert(OldPhiNodes.contains(PHI)); + (void)PHI; + } else + llvm_unreachable("all uses should be handled"); + } + } + return true; +} + +bool X86LowerAMXCast::combineAMXcast(TargetLibraryInfo *TLI) { + bool Change = false; + // Collect tile cast instruction. + SmallVector Vec2TileInsts; + SmallVector Tile2VecInsts; + SmallVector PhiCastWorkList; + SmallSetVector DeadInst; + for (BasicBlock &BB : Func) { + for (Instruction &I : BB) { + Value *Vec; + if (match(&I, + m_Intrinsic(m_Value(Vec)))) + Vec2TileInsts.push_back(&I); + else if (match(&I, m_Intrinsic( + m_Value(Vec)))) + Tile2VecInsts.push_back(&I); + } + } + + auto Convert = [&](SmallVectorImpl &Insts, Intrinsic::ID IID) { + for (auto *Inst : Insts) { + for (User *U : Inst->users()) { + IntrinsicInst *II = dyn_cast(U); + if (!II || II->getIntrinsicID() != IID) + continue; + // T1 = vec2tile V0 + // V2 = tile2vec T1 + // V3 = OP V2 + // --> + // T1 = vec2tile V0 + // V2 = tile2vec T1 + // V3 = OP V0 + II->replaceAllUsesWith(Inst->getOperand(0)); + Change = true; + } + } + }; + + Convert(Vec2TileInsts, Intrinsic::x86_cast_tile_to_vector); + Convert(Tile2VecInsts, Intrinsic::x86_cast_vector_to_tile); + + auto EraseInst = [](SmallVectorImpl &Insts) { + for (auto *Inst : Insts) { + if (Inst->use_empty()) + Inst->eraseFromParent(); + } + }; + + EraseInst(Vec2TileInsts); + EraseInst(Tile2VecInsts); + + // Handle the A->B->A cast, and there is an 
intervening PHI node. + for (BasicBlock &BB : Func) { + for (Instruction &I : BB) { + if (isAMXCast(&I)) { + if (PHINode *PN = dyn_cast(I.getOperand(0))) + PhiCastWorkList.push_back(&I); + } + } + } + for (auto *I : PhiCastWorkList) { + // We skip the dead Amxcast. + if (DeadInst.contains(I)) + continue; + PHINode *PN = cast(I->getOperand(0)); + if (optimizeAMXCastFromPhi(cast(I), PN, DeadInst)) { + DeadInst.insert(PN); + Change = true; + } + } + + // Since we create new phi and merge AMXCast, some old phis and AMXCast might + // have no uses. We do some DeadCodeElimination for them. + while (!DeadInst.empty()) { + Instruction *I = DeadInst.pop_back_val(); + Change |= DCEInstruction(I, DeadInst, TLI); + } + return Change; +} + +// There might be remaining AMXcast after combineAMXcast and they should be +// handled elegantly. +bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { + IRBuilder<> Builder(AMXCast); + AllocaInst *AllocaAddr; + Value *I8Ptr, *Stride; + auto *Src = AMXCast->getOperand(0); + + auto Prepare = [&](Type *MemTy) { + AllocaAddr = createAllocaInstAtEntry(Builder, AMXCast->getParent(), MemTy); + I8Ptr = Builder.CreateBitCast(AllocaAddr, Builder.getInt8PtrTy()); + Stride = Builder.getInt64(64); + }; + + if (AMXCast->getType()->isX86_AMXTy()) { + // %2 = amxcast <225 x i32> %src to x86_amx + // call void @llvm.x86.tilestored64.internal(i16 15, i16 60, + // i8* %addr3, i64 60, x86_amx %2) + // --> + // %addr = alloca <225 x i32>, align 64 + // store <225 x i32> %src, <225 x i32>* %addr, align 64 + // %addr2 = bitcast <225 x i32>* %addr to i8* + // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 15, i16 60, + // i8* %addr2, + // i64 60) + // call void @llvm.x86.tilestored64.internal(i16 15, i16 60, + // i8* %addr3, i64 60, x86_amx %2) + Use &U = *(AMXCast->use_begin()); + unsigned OpNo = U.getOperandNo(); + auto *II = dyn_cast(U.getUser()); + if (!II) + return false; // May be bitcast from x86amx to <256 x i32>. 
+ Prepare(AMXCast->getOperand(0)->getType()); + Builder.CreateStore(Src, AllocaAddr); + // TODO we can pick an constant operand for the shape. + Value *Row = nullptr, *Col = nullptr; + std::tie(Row, Col) = getShape(II, OpNo); + std::array Args = { + Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())}; + Value *NewInst = Builder.CreateIntrinsic( + Intrinsic::x86_tileloadd64_internal, None, Args); + AMXCast->replaceAllUsesWith(NewInst); + AMXCast->eraseFromParent(); + } else { + // %2 = amxcast x86_amx %src to <225 x i32> + // --> + // %addr = alloca <225 x i32>, align 64 + // %addr2 = bitcast <225 x i32>* to i8* + // call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, + // i8* %addr2, i64 %stride) + // %2 = load <225 x i32>, <225 x i32>* %addr, align 64 + auto *II = dyn_cast(Src); + if (!II) + return false; // May be bitcast from <256 x i32> to x86amx. + Prepare(AMXCast->getType()); + Value *Row = II->getOperand(0); + Value *Col = II->getOperand(1); + std::array Args = { + Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty()), Src}; + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, None, Args); + Value *NewInst = Builder.CreateLoad(AMXCast->getType(), AllocaAddr); + AMXCast->replaceAllUsesWith(NewInst); + AMXCast->eraseFromParent(); + } + + return true; +} + +bool X86LowerAMXCast::transformAllAMXCast() { + bool Change = false; + // Collect tile cast instruction. 
+ SmallVector WorkLists; + for (BasicBlock &BB : Func) { + for (Instruction &I : BB) { + if (isAMXCast(&I)) + WorkLists.push_back(&I); + } + } + + for (auto *Inst : WorkLists) { + Change |= transformAMXCast(cast(Inst)); + } + + return Change; +} + +} // anonymous namespace + +namespace { + class X86LowerAMXTypeLegacyPass : public FunctionPass { public: static char ID; @@ -647,8 +1037,15 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass { bool runOnFunction(Function &F) override { TargetMachine *TM = &getAnalysis().getTM(); + TargetLibraryInfo *TLI = + &getAnalysis().getTLI(F); + X86LowerAMXCast LAC(F); + LAC.combineAMXcast(TLI); + // There might be remaining AMXcast after combineAMXcast and they should be + // handled elegantly. + LAC.transformAllAMXCast(); - X86LowerAMXType LAT(F, TM); + X86LowerAMXType LAT(F); bool C = LAT.visit(); // Prepare for fast register allocation at O0. @@ -671,6 +1068,7 @@ class X86LowerAMXTypeLegacyPass : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); } }; @@ -681,6 +1079,7 @@ char X86LowerAMXTypeLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(X86LowerAMXTypeLegacyPass, DEBUG_TYPE, PassName, false, false) diff --git a/llvm/test/CodeGen/X86/AMX/amx-type.ll b/llvm/test/CodeGen/X86/AMX/amx-type.ll index 989a1076ce7a6..ddf650525baaa 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-type.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-type.ll @@ -163,18 +163,19 @@ define dso_local void @__tile_dpbssd(%struct.__tile_str* nocapture %0, %struct._ ; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP6]], align 2 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP8]], align 2 
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8* -; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP11]], i64 64) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <256 x i32>* [[TMP13]] to i8* -; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP9]], i8* [[TMP14]], i64 64) -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <256 x i32>* [[TMP16]] to i8* -; CHECK-NEXT: [[TMP18:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP7]], i8* [[TMP17]], i64 64) -; CHECK-NEXT: [[TMP19:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP5]], i16 [[TMP7]], i16 [[TMP9]], x86_amx [[TMP12]], x86_amx [[TMP15]], x86_amx [[TMP18]]) -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <256 x i32>* [[TMP10]] to i8* -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP20]], i64 64, x86_amx [[TMP19]]) +; CHECK-NEXT: [[TMP10:%.*]] = udiv i16 [[TMP9]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP11]] to i8* +; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP12]], i64 64) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <256 x i32>* [[TMP14]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[TMP9]], i8* [[TMP15]], 
i64 64) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <256 x i32>* [[TMP17]] to i8* +; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP10]], i16 [[TMP7]], i8* [[TMP18]], i64 64) +; CHECK-NEXT: [[TMP20:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP5]], i16 [[TMP7]], i16 [[TMP9]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32>* [[TMP11]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP5]], i16 [[TMP7]], i8* [[TMP21]], i64 64, x86_amx [[TMP20]]) ; CHECK-NEXT: ret void ; %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 0 @@ -200,15 +201,16 @@ define dso_local void @__tile_dpbssd(%struct.__tile_str* nocapture %0, %struct._ define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { ; CHECK-LABEL: @__tile_dpbsud( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K:%.*]], i8* [[TMP1]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[K]], i16 [[N:%.*]], i8* [[TMP3]], i64 64) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP5]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP6]], x86_amx [[TMP2]], x86_amx [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[PC]] to i8* -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP7]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP1:%.*]] 
= udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP2]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], i8* [[TMP4]], i64 64) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP6]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <256 x i32>* [[PC]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP8]], i64 64, x86_amx [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 @@ -225,15 +227,16 @@ define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, < define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { ; CHECK-LABEL: @__tile_dpbusd( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K:%.*]], i8* [[TMP1]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[K]], i16 [[N:%.*]], i8* [[TMP3]], i64 64) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP5]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP6]], x86_amx [[TMP2]], x86_amx [[TMP4]]) 
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[PC]] to i8* -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP7]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP2]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], i8* [[TMP4]], i64 64) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP6]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <256 x i32>* [[PC]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP8]], i64 64, x86_amx [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 @@ -250,15 +253,16 @@ define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, < define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { ; CHECK-LABEL: @__tile_dpbuud( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K:%.*]], i8* [[TMP1]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[K]], i16 [[N:%.*]], i8* [[TMP3]], i64 64) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx 
@llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP5]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP6]], x86_amx [[TMP2]], x86_amx [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[PC]] to i8* -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP7]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP2]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], i8* [[TMP4]], i64 64) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP6]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <256 x i32>* [[PC]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP8]], i64 64, x86_amx [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 @@ -275,15 +279,16 @@ define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, < define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { ; CHECK-LABEL: @__tile_dpbf16ps( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K:%.*]], i8* [[TMP1]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* -; CHECK-NEXT: 
[[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[K]], i16 [[N:%.*]], i8* [[TMP3]], i64 64) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP5]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP6]], x86_amx [[TMP2]], x86_amx [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <256 x i32>* [[PC]] to i8* -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP7]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i32>* [[PA:%.*]] to i8* +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP2]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i32>* [[PB:%.*]] to i8* +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], i8* [[TMP4]], i64 64) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[PC:%.*]] to i8* +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP6]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP7]], x86_amx [[TMP3]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <256 x i32>* [[PC]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP8]], i64 64, x86_amx [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll new file mode 100644 index 0000000000000..4aa5c7e3e1b9a --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + +define void @combine_amx_cast_inside_bb() { +; CHECK-LABEL: @combine_amx_cast_inside_bb( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP0]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %tmp) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %1) + ret void +} + +; Cases where amxcast can be combined across bb +; %5 and %6 is combined together since %goodphi's incoming is phi or amxcast +define void @combine_amx_cast_and_phi() { +; CHECK-LABEL: @combine_amx_cast_and_phi( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK: for.body.i.lr.ph.i: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <110 x i32>* [[TMP2]] to i8* +; CHECK-NEXT: store <110 x i32> undef, <110 x i32>* [[TMP2]], align 512 +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP4]], i64 40) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <616 x i8>* [[TMP1]] to i8* +; CHECK-NEXT: store <616 x i8> undef, <616 x i8>* [[TMP1]], align 1024 +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* 
[[TMP6]], i64 56) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <560 x i8>* [[TMP0]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP0]], align 1024 +; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP8]], i64 40) +; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] +; CHECK: for.cond.cleanup.i.i: +; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP11]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + +for.body.i.lr.ph.i: ; preds = %wrapper_entry + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + br label %for.cond.cleanup.i.i + +for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry + %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] + %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %6) + ret void +} + +; Cases where amxcast can't be combined across bb +; %5 and %6 is not combined together since %evilphi's incoming is not phi or amxcast +define void 
@fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { +; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <560 x i8>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = add <110 x i32> [[TMP:%.*]], [[TMP]] +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK: for.body.i.lr.ph.i: +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <110 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <110 x i32> undef, <110 x i32>* [[TMP4]], align 512 +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP6]], i64 40) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <616 x i8>* [[TMP3]] to i8* +; CHECK-NEXT: store <616 x i8> undef, <616 x i8>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* [[TMP8]], i64 56) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <560 x i8>* [[TMP2]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP10]], i64 40) +; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP7]], x86_amx [[TMP9]], x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <110 x i32>* [[TMP1]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* [[TMP13]], i64 40, x86_amx [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = load <110 x i32>, <110 x i32>* [[TMP1]], align 512 +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] +; CHECK: for.cond.cleanup.i.i: +; CHECK-NEXT: [[EVILPHI:%.*]] = phi <110 x i32> [ [[TMP5]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP14]], [[FOR_BODY_I_LR_PH_I]] ] +; 
CHECK-NEXT: [[TMP15:%.*]] = bitcast <110 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: store <110 x i32> [[EVILPHI]], <110 x i32>* [[TMP0]], align 512 +; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP15]], i64 40) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP16]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = add <110 x i32> %tmp, %tmp + br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + +for.body.i.lr.ph.i: ; preds = %wrapper_entry + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + br label %for.cond.cleanup.i.i + +for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry + %evilphi = phi <110 x i32> [ %0, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] + %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %6) + ret void +} + +; Cases where amxcast can't be combined across bb +; %5 and %6 is not combined together since %evilphi's user aka %evilphi2 is not inside phi web. 
+define void @fail_to_combine_amx_cast_and_phi2() { +; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi2( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <560 x i8>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <110 x i32>* [[TMP5]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* [[TMP7]], i64 40, x86_amx [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = load <110 x i32>, <110 x i32>* [[TMP5]], align 512 +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK: for.body.i.lr.ph.i: +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <110 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <110 x i32> undef, <110 x i32>* [[TMP4]], align 512 +; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP9]], i64 40) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <616 x i8>* [[TMP3]] to i8* +; CHECK-NEXT: store <616 x i8> undef, <616 x i8>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* [[TMP11]], i64 56) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <560 x i8>* [[TMP2]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP13]], i64 40) +; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]]) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <110 x i32>* [[TMP1]] to i8* +; CHECK-NEXT: call void 
@llvm.x86.tilestored64.internal(i16 11, i16 40, i8* [[TMP16]], i64 40, x86_amx [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = load <110 x i32>, <110 x i32>* [[TMP1]], align 512 +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] +; CHECK: for.cond.cleanup.i.i: +; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <110 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: store <110 x i32> [[GOODPHI]], <110 x i32>* [[TMP0]], align 512 +; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP18]], i64 40) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP19]]) +; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] +; CHECK: exit: +; CHECK-NEXT: [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: store <110 x i32> [[EVILPHI2]], <110 x i32>* undef, align 512 +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + +for.body.i.lr.ph.i: ; preds = %wrapper_entry + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + br i1 undef, label %for.cond.cleanup.i.i, label %exit + +for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry + %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ 
%5, %for.body.i.lr.ph.i ] + %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %6) + br i1 undef, label %exit, label %for.body.i.lr.ph.i +exit: + %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] + store <110 x i32> %evilphi2, <110 x i32>* undef, align 512 + ret void +} + +define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() { +; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi_due_to_const_value( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <560 x i8>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK: for.body.i.lr.ph.i: +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <110 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <110 x i32> undef, <110 x i32>* [[TMP4]], align 512 +; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP5]], i64 40) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <616 x i8>* [[TMP3]] to i8* +; CHECK-NEXT: store <616 x i8> undef, <616 x i8>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* [[TMP7]], i64 56) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <560 x i8>* [[TMP2]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP9]], i64 40) +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <110 x i32>* [[TMP1]] to i8* +; 
CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* [[TMP12]], i64 40, x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = load <110 x i32>, <110 x i32>* [[TMP1]], align 512 +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] +; CHECK: for.cond.cleanup.i.i: +; CHECK-NEXT: [[EVILPHI:%.*]] = phi <110 x i32> [ undef, [[WRAPPER_ENTRY:%.*]] ], [ [[TMP13]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <110 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: store <110 x i32> [[EVILPHI]], <110 x i32>* [[TMP0]], align 512 +; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP14]], i64 40) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP15]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + +for.body.i.lr.ph.i: ; preds = %wrapper_entry + %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %3 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %0, x86_amx %1, x86_amx %2) + %4 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %3) + br label %for.cond.cleanup.i.i + +for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry + %evilphi = phi <110 x i32> [ undef, %wrapper_entry ], [ %4, %for.body.i.lr.ph.i ] + %5 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %5) + ret void +} + +; Cases where amxcast can be combined across bb +; When optimizeAMXCastFromPhi process %6 and %goodphi, %goodphi2 is outside the phi-web, so the optimization stop +; When optimizeAMXCastFromPhi process %7 and %goodphi2, the optimization continue. 
+define void @combine_amx_cast_and_multiple_phi() { +; CHECK-LABEL: @combine_amx_cast_and_multiple_phi( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK: for.body.i.lr.ph.i: +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <110 x i32>* [[TMP2]] to i8* +; CHECK-NEXT: store <110 x i32> undef, <110 x i32>* [[TMP2]], align 512 +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP4]], i64 40) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <616 x i8>* [[TMP1]] to i8* +; CHECK-NEXT: store <616 x i8> undef, <616 x i8>* [[TMP1]], align 1024 +; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* [[TMP6]], i64 56) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <560 x i8>* [[TMP0]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP0]], align 1024 +; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP8]], i64 40) +; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] +; CHECK: for.cond.cleanup.i.i: +; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP11]]) +; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] +; CHECK: exit: +; CHECK-NEXT: [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], 
[[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP12]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + +for.body.i.lr.ph.i: ; preds = %wrapper_entry + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + br i1 undef, label %for.cond.cleanup.i.i, label %exit + +for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry + %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] + %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %6) + br i1 undef, label %exit, label %for.body.i.lr.ph.i +exit: + %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] + %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %7) + ret void +} + +; Currently we are not able to delete DeadPHICycle, later we will handle with them +define void @combine_amx_cast_and_phi_in_a_circle() { +; CHECK-LABEL: @combine_amx_cast_and_phi_in_a_circle( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <560 x i8>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <616 x i8>, align 
64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <110 x i32>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <110 x i32>* [[TMP3]] to i8* +; CHECK-NEXT: store <110 x i32> undef, <110 x i32>* [[TMP3]], align 512 +; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* [[TMP5]], i64 40) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <616 x i8>* [[TMP2]] to i8* +; CHECK-NEXT: store <616 x i8> undef, <616 x i8>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* [[TMP7]], i64 56) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <560 x i8>* [[TMP1]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP1]], align 1024 +; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP9]], i64 40) +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <110 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* [[TMP12]], i64 40, x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = load <110 x i32>, <110 x i32>* [[TMP0]], align 512 +; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ] +; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP14]]) +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ] +; CHECK-NEXT: [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ 
[[TMP13]], [[BB1]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP15]]) +; CHECK-NEXT: br i1 undef, label [[BB2]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP15]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + br label %bb1 + +bb1: ; preds = %wrapper_entry + %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + br i1 undef, label %bb2, label %bb3 + +bb2: ; preds = %bb1, %wrapper_entry + %goodphi = phi <110 x i32> [ %evilphi2, %bb3], [ %5, %bb1 ] + %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %6) + br label %bb3 +bb3: + %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ] + %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %7) + br i1 undef, label %bb2, label %exit +exit: + %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %8) + ret void +} + +define void @eliminate_unused_phi_and_cast() { +; CHECK-LABEL: @eliminate_unused_phi_and_cast( +; CHECK-NEXT: wrapper_entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 +; 
CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK: for.body.i.lr.ph.i: +; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* undef, i64 undef) +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* undef, i64 undef) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <560 x i8>* [[TMP0]] to i8* +; CHECK-NEXT: store <560 x i8> undef, <560 x i8>* [[TMP0]], align 1024 +; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* [[TMP4]], i64 40) +; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP2]], x86_amx [[TMP3]], x86_amx [[TMP5]]) +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] +; CHECK: for.cond.cleanup.i.i: +; CHECK-NEXT: [[TMP7:%.*]] = phi x86_amx [ [[TMP1]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx [[TMP7]]) +; CHECK-NEXT: ret void +; +wrapper_entry: + %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, i8* undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + +for.body.i.lr.ph.i: ; preds = %wrapper_entry + %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, i8* undef, i64 undef) + %v1 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %1) + %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, i8* undef, i64 undef) + %v2 = call <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(x86_amx %2) + %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %v1) + %4 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> %v2) + %5 = call x86_amx 
@llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %6 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %3, x86_amx %4, x86_amx %5) + %7 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %6) + br label %for.cond.cleanup.i.i + +for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry + %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %7, %for.body.i.lr.ph.i ] + %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, i8* undef, i64 undef, x86_amx %8) + ret void +} + +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx) +declare <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(x86_amx) +declare x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32>) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) +declare x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8>) +declare x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8>) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll new file mode 100644 index 0000000000000..98a820197bbd6 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll @@ -0,0 +1,429 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -lower-amx-type %s -S | FileCheck %s + +%struct.__tile_str = type { i16, i16, <256 x i32> } + +@buf = dso_local global [1024 x i8] zeroinitializer, align 64 +@buf2 = dso_local global [1024 x i8] zeroinitializer, align 64 + +; test bitcast x86_amx to <256 x i32> +define dso_local void @test_user_empty(i16 %m, i16 %n, i8 *%buf, i64 %s) { +; CHECK-LABEL: @test_user_empty( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[T1:%.*]] = 
call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], i8* [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: ret void +; +entry: + %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, i8* %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + ret void +} + +; test bitcast <256 x i32> to x86_amx +define dso_local void @test_user_empty2(<256 x i32> %in) { +; CHECK-LABEL: @test_user_empty2( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %in) + ret void +} + +define dso_local <256 x i32> @test_amx_load_bitcast_v256i32(<256 x i32>* %in, i16 %m, i16 %n, i8 *%buf, i64 %s) { +; CHECK-LABEL: @test_amx_load_bitcast_v256i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[T1:%.*]] = load <256 x i32>, <256 x i32>* [[IN:%.*]], align 64 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: store <256 x i32> [[T1]], <256 x i32>* [[TMP0]], align 1024 +; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[N:%.*]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], i8* [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP3]]) +; CHECK-NEXT: ret <256 x i32> [[T1]] +; +entry: + %t1 = load <256 x i32>, <256 x i32>* %in, align 64 + %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1) + call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, i8* %buf, i64 %s, x86_amx %t2) + ret <256 x i32> %t1 +} + +define dso_local <225 x i32> @test_amx_load_bitcast_v225i32(<225 x i32>* %in, i16 %m, i16 %n, i8 *%buf, i64 %s) { +; CHECK-LABEL: @test_amx_load_bitcast_v225i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <225 x i32>, align 64 +; CHECK-NEXT: [[T1:%.*]] = load <225 x i32>, <225 x i32>* [[IN:%.*]], align 64 
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <225 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: store <225 x i32> [[T1]], <225 x i32>* [[TMP0]], align 1024 +; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[N:%.*]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], i8* [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP3]]) +; CHECK-NEXT: ret <225 x i32> [[T1]] +; +entry: + %t1 = load <225 x i32>, <225 x i32>* %in, align 64 + %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %t1) + call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, i8* %buf, i64 %s, x86_amx %t2) + ret <225 x i32> %t1 +} + +define dso_local <256 x i32> @test_amx_bitcast_store(<256 x i32>* %out, i16 %m, i16 %n, i8 *%buf, i64 %s) { +; CHECK-LABEL: @test_amx_bitcast_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], i8* [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[M]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], i8* [[TMP1]], i64 [[TMP2]], x86_amx [[T1]]) +; CHECK-NEXT: [[TMP3:%.*]] = load <256 x i32>, <256 x i32>* [[TMP0]], align 1024 +; CHECK-NEXT: store <256 x i32> [[TMP3]], <256 x i32>* [[OUT:%.*]], align 1024 +; CHECK-NEXT: ret <256 x i32> [[TMP3]] +; +entry: + %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, i8* %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + store <256 x i32> %t2, <256 x i32>* %out + ret <256 x i32> %t2 +} + +define dso_local void @test_src_add(<256 x i32> %x, <256 x i32> %y, i16 %r, i16 %c, i8* %buf, i64 %s) { +; CHECK-LABEL: @test_src_add( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 
x i32>, align 64 +; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: store <256 x i32> [[ADD]], <256 x i32>* [[TMP0]], align 1024 +; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[C:%.*]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C]], i8* [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], i8* [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + %add = add <256 x i32> %y, %x + %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %add) + call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, i8* %buf, i64 %s, x86_amx %t) + ret void +} + +define dso_local void @test_src_add2(<256 x i32> %x, i16 %r, i16 %c, i8* %buf, i64 %s) { +; CHECK-LABEL: @test_src_add2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], i8* [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i32>* [[TMP0]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[C]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], i8* [[TMP1]], i64 [[TMP2]], x86_amx [[T1]]) +; CHECK-NEXT: [[TMP3:%.*]] = load <256 x i32>, <256 x i32>* [[TMP0]], align 1024 +; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[TMP3]], [[X:%.*]] +; CHECK-NEXT: ret void +; +entry: + %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, i8* %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %add = add <256 x i32> %t2, %x + ret void +} + +define dso_local void @__tile_loadd(%struct.__tile_str* nocapture %0, i8* %1, i64 %2) local_unnamed_addr { +; CHECK-LABEL: @__tile_loadd( +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP5]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP2:%.*]], 32 +; CHECK-NEXT: [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 32 +; CHECK-NEXT: [[TMP11:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP6]], i16 [[TMP8]], i8* [[TMP1:%.*]], i64 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[TMP8]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP6]], i16 [[TMP8]], i8* [[TMP12]], i64 [[TMP13]], x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = load <256 x i32>, <256 x i32>* [[TMP4]], align 1024 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0]], i64 0, i32 2 +; CHECK-NEXT: store <256 x i32> [[TMP14]], <256 x i32>* [[TMP15]], align 64 +; CHECK-NEXT: ret void +; + %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 0 + %5 = load i16, i16* %4, align 64 + %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 1 + %7 = load i16, i16* %6, align 2 + %8 = shl i64 %2, 32 + %9 = ashr exact i64 %8, 32 + %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %5, i16 %7, i8* %1, i64 %9) + %11 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) + %12 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2 + store <256 x i32> %11, <256 x i32>* %12, align 64 + ret void +} + +define dso_local void @__tile_dpbssd(%struct.__tile_str* nocapture %0, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 
64 %2) local_unnamed_addr { +; CHECK-LABEL: @__tile_dpbssd( +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP6:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP7:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP1:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP8]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP12]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = udiv i16 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP0:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = load <256 x i32>, <256 x i32>* [[TMP15]], align 64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <256 x i32>* [[TMP7]] to i8* +; CHECK-NEXT: store <256 x i32> [[TMP16]], <256 x i32>* [[TMP7]], align 1024 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP11]], i8* [[TMP17]], i64 [[TMP18]]) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP1]], i64 0, i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = load <256 x i32>, <256 x i32>* [[TMP20]], align 64 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <256 x i32>* [[TMP6]] to i8* +; CHECK-NEXT: store <256 x i32> [[TMP21]], <256 x i32>* [[TMP6]], align 1024 +; CHECK-NEXT: [[TMP23:%.*]] = sext i16 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP13]], i8* [[TMP22]], i64 [[TMP23]]) +; 
CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = load <256 x i32>, <256 x i32>* [[TMP25]], align 64 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <256 x i32>* [[TMP5]] to i8* +; CHECK-NEXT: store <256 x i32> [[TMP26]], <256 x i32>* [[TMP5]], align 1024 +; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP29:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP14]], i16 [[TMP11]], i8* [[TMP27]], i64 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP9]], i16 [[TMP11]], i16 [[TMP13]], x86_amx [[TMP19]], x86_amx [[TMP24]], x86_amx [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: [[TMP32:%.*]] = sext i16 [[TMP11]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP9]], i16 [[TMP11]], i8* [[TMP31]], i64 [[TMP32]], x86_amx [[TMP30]]) +; CHECK-NEXT: [[TMP33:%.*]] = load <256 x i32>, <256 x i32>* [[TMP4]], align 1024 +; CHECK-NEXT: store <256 x i32> [[TMP33]], <256 x i32>* [[TMP15]], align 64 +; CHECK-NEXT: ret void +; + %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 0 + %5 = load i16, i16* %4, align 64 + %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1 + %7 = load i16, i16* %6, align 2 + %8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 1 + %9 = load i16, i16* %8, align 2 + %10 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %0, i64 0, i32 2 + %11 = load <256 x i32>, <256 x i32>* %10, align 64 + %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) + %13 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %1, i64 0, i32 2 + %14 = load <256 x i32>, <256 x i32>* %13, align 64 + %15 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %14) + %16 = getelementptr inbounds 
%struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2 + %17 = load <256 x i32>, <256 x i32>* %16, align 64 + %18 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %17) + %19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %5, i16 %7, i16 %9, x86_amx %12, x86_amx %15, x86_amx %18) + %20 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %19) + store <256 x i32> %20, <256 x i32>* %10, align 64 + ret void +} + +define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { +; CHECK-LABEL: @__tile_dpbsud( +; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[T0:%.*]] = load <256 x i32>, <256 x i32>* [[PA:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <256 x i32> [[T0]], <256 x i32>* [[TMP4]], align 1024 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[K]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP6]], i64 [[TMP7]]) +; CHECK-NEXT: [[T2:%.*]] = load <256 x i32>, <256 x i32>* [[PB:%.*]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP3]] to i8* +; CHECK-NEXT: store <256 x i32> [[T2]], <256 x i32>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[N:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[N]], i8* [[TMP9]], i64 [[TMP10]]) +; CHECK-NEXT: [[T4:%.*]] = load <256 x i32>, <256 x i32>* [[PC:%.*]], align 64 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP2]] to i8* +; CHECK-NEXT: store <256 x i32> [[T4]], <256 x i32>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] 
= call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP12]], i64 [[TMP13]]) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP14]], x86_amx [[TMP8]], x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <256 x i32>* [[TMP1]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP15]], i64 [[TMP16]], x86_amx [[T6]]) +; CHECK-NEXT: [[TMP17:%.*]] = load <256 x i32>, <256 x i32>* [[TMP1]], align 1024 +; CHECK-NEXT: store <256 x i32> [[TMP17]], <256 x i32>* [[PC]], align 64 +; CHECK-NEXT: ret void +; + %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 + %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t2 = load <256 x i32>, <256 x i32>* %pb, align 64 + %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t4 = load <256 x i32>, <256 x i32>* %pc, align 64 + %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + store <256 x i32> %t7, <256 x i32>* %pc, align 64 + ret void +} + +define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { +; CHECK-LABEL: @__tile_dpbusd( +; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[T0:%.*]] = load <256 x i32>, <256 x i32>* [[PA:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <256 x i32> [[T0]], <256 x i32>* [[TMP4]], align 1024 +; 
CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[K]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP6]], i64 [[TMP7]]) +; CHECK-NEXT: [[T2:%.*]] = load <256 x i32>, <256 x i32>* [[PB:%.*]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP3]] to i8* +; CHECK-NEXT: store <256 x i32> [[T2]], <256 x i32>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[N:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[N]], i8* [[TMP9]], i64 [[TMP10]]) +; CHECK-NEXT: [[T4:%.*]] = load <256 x i32>, <256 x i32>* [[PC:%.*]], align 64 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP2]] to i8* +; CHECK-NEXT: store <256 x i32> [[T4]], <256 x i32>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP12]], i64 [[TMP13]]) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP14]], x86_amx [[TMP8]], x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <256 x i32>* [[TMP1]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP15]], i64 [[TMP16]], x86_amx [[T6]]) +; CHECK-NEXT: [[TMP17:%.*]] = load <256 x i32>, <256 x i32>* [[TMP1]], align 1024 +; CHECK-NEXT: store <256 x i32> [[TMP17]], <256 x i32>* [[PC]], align 64 +; CHECK-NEXT: ret void +; + %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 + %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t2 = load <256 x i32>, <256 x i32>* %pb, align 64 + %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t4 = load <256 x i32>, <256 x i32>* %pc, align 64 + %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call x86_amx 
@llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + store <256 x i32> %t7, <256 x i32>* %pc, align 64 + ret void +} + +define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { +; CHECK-LABEL: @__tile_dpbuud( +; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[T0:%.*]] = load <256 x i32>, <256 x i32>* [[PA:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <256 x i32> [[T0]], <256 x i32>* [[TMP4]], align 1024 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[K]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP6]], i64 [[TMP7]]) +; CHECK-NEXT: [[T2:%.*]] = load <256 x i32>, <256 x i32>* [[PB:%.*]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP3]] to i8* +; CHECK-NEXT: store <256 x i32> [[T2]], <256 x i32>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[N:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[N]], i8* [[TMP9]], i64 [[TMP10]]) +; CHECK-NEXT: [[T4:%.*]] = load <256 x i32>, <256 x i32>* [[PC:%.*]], align 64 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP2]] to i8* +; CHECK-NEXT: store <256 x i32> [[T4]], <256 x i32>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP12]], i64 [[TMP13]]) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx 
[[TMP14]], x86_amx [[TMP8]], x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <256 x i32>* [[TMP1]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP15]], i64 [[TMP16]], x86_amx [[T6]]) +; CHECK-NEXT: [[TMP17:%.*]] = load <256 x i32>, <256 x i32>* [[TMP1]], align 1024 +; CHECK-NEXT: store <256 x i32> [[TMP17]], <256 x i32>* [[PC]], align 64 +; CHECK-NEXT: ret void +; + %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 + %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t2 = load <256 x i32>, <256 x i32>* %pb, align 64 + %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t4 = load <256 x i32>, <256 x i32>* %pc, align 64 + %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + store <256 x i32> %t7, <256 x i32>* %pc, align 64 + ret void +} + +define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, <256 x i32>* %pc, <256 x i32>* %pa, <256 x i32>* %pb) { +; CHECK-LABEL: @__tile_dpbf16ps( +; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP2:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP3:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = udiv i16 [[K:%.*]], 4 +; CHECK-NEXT: [[T0:%.*]] = load <256 x i32>, <256 x i32>* [[PA:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <256 x i32> [[T0]], <256 x i32>* [[TMP4]], align 1024 +; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[K]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], i8* [[TMP6]], i64 [[TMP7]]) +; CHECK-NEXT: [[T2:%.*]] = 
load <256 x i32>, <256 x i32>* [[PB:%.*]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <256 x i32>* [[TMP3]] to i8* +; CHECK-NEXT: store <256 x i32> [[T2]], <256 x i32>* [[TMP3]], align 1024 +; CHECK-NEXT: [[TMP10:%.*]] = sext i16 [[N:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP5]], i16 [[N]], i8* [[TMP9]], i64 [[TMP10]]) +; CHECK-NEXT: [[T4:%.*]] = load <256 x i32>, <256 x i32>* [[PC:%.*]], align 64 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32>* [[TMP2]] to i8* +; CHECK-NEXT: store <256 x i32> [[T4]], <256 x i32>* [[TMP2]], align 1024 +; CHECK-NEXT: [[TMP13:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], i8* [[TMP12]], i64 [[TMP13]]) +; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP14]], x86_amx [[TMP8]], x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <256 x i32>* [[TMP1]] to i8* +; CHECK-NEXT: [[TMP16:%.*]] = sext i16 [[N]] to i64 +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], i8* [[TMP15]], i64 [[TMP16]], x86_amx [[T6]]) +; CHECK-NEXT: [[TMP17:%.*]] = load <256 x i32>, <256 x i32>* [[TMP1]], align 1024 +; CHECK-NEXT: store <256 x i32> [[TMP17]], <256 x i32>* [[PC]], align 64 +; CHECK-NEXT: ret void +; + %t0 = load <256 x i32>, <256 x i32>* %pa, align 64 + %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t2 = load <256 x i32>, <256 x i32>* %pb, align 64 + %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t4 = load <256 x i32>, <256 x i32>* %pc, align 64 + %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + store <256 x i32> %t7, <256 x i32>* 
%pc, align 64 + ret void +} + +define dso_local void @__tile_stored(i8* %0, i64 %1, %struct.__tile_str* nocapture readonly byval(%struct.__tile_str) align 64 %2) local_unnamed_addr { +; CHECK-LABEL: @__tile_stored( +; CHECK-NEXT: [[TMP4:%.*]] = alloca <256 x i32>, align 64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], %struct.__tile_str* [[TMP2:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP5]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], %struct.__tile_str* [[TMP2]], i64 0, i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = load <256 x i32>, <256 x i32>* [[TMP9]], align 64 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32>* [[TMP4]] to i8* +; CHECK-NEXT: store <256 x i32> [[TMP10]], <256 x i32>* [[TMP4]], align 1024 +; CHECK-NEXT: [[TMP12:%.*]] = sext i16 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP6]], i16 [[TMP8]], i8* [[TMP11]], i64 [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[TMP1:%.*]], 32 +; CHECK-NEXT: [[TMP15:%.*]] = ashr exact i64 [[TMP14]], 32 +; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP6]], i16 [[TMP8]], i8* [[TMP0:%.*]], i64 [[TMP15]], x86_amx [[TMP13]]) +; CHECK-NEXT: ret void +; + %4 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 0 + %5 = load i16, i16* %4, align 64 + %6 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 1 + %7 = load i16, i16* %6, align 2 + %8 = getelementptr inbounds %struct.__tile_str, %struct.__tile_str* %2, i64 0, i32 2 + %9 = load <256 x i32>, <256 x i32>* %8, align 64 + %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) + %11 = shl i64 %1, 32 + %12 = ashr exact i64 %11, 32 + tail call void 
@llvm.x86.tilestored64.internal(i16 %5, i16 %7, i8* %0, i64 %12, x86_amx %10) + ret void +} + +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) + +declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) +declare x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32>) +declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) +declare <225 x i32> @llvm.x86.cast.tile.to.vector.v225i32(x86_amx) From 583a7542480b8508803b8ffa2f3dbd09a0bf346b Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Tue, 17 Aug 2021 07:04:21 +0000 Subject: [PATCH 190/700] [mlir][linalg] Remove duplicate methods (NFC). Remove duplicate methods used to check iterator types. Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D108102 --- .../include/mlir/Dialect/Linalg/Utils/Utils.h | 13 -------- .../Linalg/Transforms/Distribution.cpp | 2 +- .../Linalg/Transforms/Vectorization.cpp | 4 +-- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 31 +++---------------- .../Transforms/Sparsification.cpp | 2 +- 5 files changed, 9 insertions(+), 43 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index b03fde6e9b370..81ab7eaa08866 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -56,19 +56,6 @@ SmallVector getDynOperands(Location loc, Value val, OpBuilder &b); /// Otherwise return nullptr. 
IntegerAttr getSmallestBoundingIndex(Value size); -//===----------------------------------------------------------------------===// -// Iterator type utilities -//===----------------------------------------------------------------------===// - -/// Checks if an iterator_type attribute is parallel. -bool isParallelIteratorType(Attribute attr); - -/// Checks if an iterator_type attribute is parallel. -bool isReductionIteratorType(Attribute attr); - -/// Checks if an iterator_type attribute is parallel. -bool isWindowIteratorType(Attribute attr); - //===----------------------------------------------------------------------===// // Fusion utilities //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp b/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp index 994f7c76ddfda..e951d68820220 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Distribution.cpp @@ -53,7 +53,7 @@ struct DistributeTiledLoopPattern if (procInfoCallback == options.procInfoMap.end()) continue; - if (!isParallelIteratorType(op.iterator_types()[i])) { + if (!isParallelIterator(op.iterator_types()[i])) { op.emitOpError("only support for parallel loops is implemented"); return failure(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index abb4328b08f10..0a622e8335c90 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -210,7 +210,7 @@ static Value reduceIfNeeded(OpBuilder &b, VectorType targetVectorType, unsigned idx = 0; SmallVector reductionMask(linalgOp.iterator_types().size(), false); for (auto attr : linalgOp.iterator_types()) { - if (isReductionIteratorType(attr)) + if (isReductionIterator(attr)) reductionMask[idx] = true; ++idx; } @@ -615,7 +615,7 @@ static bool 
allIndexingsAreProjectedPermutation(LinalgOp op) { // TODO: probably need some extra checks for reduction followed by consumer // ops that may not commute (e.g. linear reduction + non-linear instructions). static LogicalResult reductionPreconditions(LinalgOp op) { - if (llvm::none_of(op.iterator_types(), isReductionIteratorType)) + if (llvm::none_of(op.iterator_types(), isReductionIterator)) return failure(); for (OpOperand *opOperand : op.getOutputOperands()) { Operation *reductionOp = getSingleBinaryOpAssumedReduction(opOperand); diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 1620a047390be..596ae49232c6d 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -116,27 +116,6 @@ RegionMatcher::matchAsScalarBinaryOp(GenericOp op) { return llvm::None; } -bool mlir::linalg::isParallelIteratorType(Attribute attr) { - if (auto strAttr = attr.dyn_cast()) { - return strAttr.getValue() == getParallelIteratorTypeName(); - } - return false; -} - -bool mlir::linalg::isReductionIteratorType(Attribute attr) { - if (auto strAttr = attr.dyn_cast()) { - return strAttr.getValue() == getReductionIteratorTypeName(); - } - return false; -} - -bool mlir::linalg::isWindowIteratorType(Attribute attr) { - if (auto strAttr = attr.dyn_cast()) { - return strAttr.getValue() == getWindowIteratorTypeName(); - } - return false; -} - /// Explicit instantiation of loop nest generator for different loop types. template struct mlir::linalg::GenerateLoopNest; template struct mlir::linalg::GenerateLoopNest; @@ -233,7 +212,7 @@ void GenerateLoopNest::doit( // Collect loop ranges for parallel dimensions. SmallVector parallelLoopRanges; for (auto iteratorType : enumerate(iteratorTypes)) - if (isParallelIteratorType(iteratorType.value())) + if (isParallelIterator(iteratorType.value())) parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); // Get their distribution schemes. 
@@ -254,7 +233,7 @@ void GenerateLoopNest::doit( // Filter out scf.for loops that were created out of parallel dimensions. SmallVector loops; for (auto iteratorType : enumerate(iteratorTypes)) - if (isParallelIteratorType(iteratorType.value())) + if (isParallelIterator(iteratorType.value())) loops.push_back(loopNest.loops[iteratorType.index()]); // Distribute - only supports cyclic distribution for now. @@ -375,7 +354,7 @@ static void generateParallelLoopNest( // Find the outermost parallel loops and drop their types from the list. unsigned nLoops = iteratorTypes.size(); unsigned nOuterPar = - nLoops - iteratorTypes.drop_while(isParallelIteratorType).size(); + nLoops - iteratorTypes.drop_while(isParallelIterator).size(); // If there are no outer parallel loops, generate one sequential loop and // recurse. Note that we wouldn't have dropped anything from `iteratorTypes` @@ -502,7 +481,7 @@ void GenerateLoopNest::doit( distributionOptions->distributionMethod.end()); SmallVector parallelLoopRanges; for (auto iteratorType : enumerate(iteratorTypes)) { - if (isParallelIteratorType(iteratorType.value())) + if (isParallelIterator(iteratorType.value())) parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); } if (distributionMethod.size() < parallelLoopRanges.size()) @@ -513,7 +492,7 @@ void GenerateLoopNest::doit( for (auto iteratorType : enumerate(iteratorTypes)) { if (index >= procInfo.size()) break; - if (isParallelIteratorType(iteratorType.value())) { + if (isParallelIterator(iteratorType.value())) { unsigned i = iteratorType.index(); updateBoundsForCyclicDistribution(b, loc, procInfo[index].procId, procInfo[index].nprocs, lbsStorage[i], diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index b2c64e450a84f..2567693c4b641 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -743,7 +743,7 
@@ static Operation *genFor(Merger &merger, CodeGen &codegen, unsigned tensor = merger.tensor(fb); assert(idx == merger.index(fb)); auto iteratorTypes = op.iterator_types().getValue(); - bool isReduction = linalg::isReductionIteratorType(iteratorTypes[idx]); + bool isReduction = isReductionIterator(iteratorTypes[idx]); bool isSparse = merger.isDim(fb, Dim::kSparse); bool isVector = isVectorFor(codegen, isInner, isSparse) && denseUnitStrides(merger, op, idx); From 6d952b08bdac145879fcfb7f57600ff02d16773e Mon Sep 17 00:00:00 2001 From: PeixinQiao Date: Tue, 17 Aug 2021 17:16:37 +0800 Subject: [PATCH 191/700] [NFC] Fix typos Initial commit test. --- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 9e242b04cc6a5..c90659a959706 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -835,7 +835,7 @@ class OpenMPIRBuilder { /// \param BodyGenCB Callback that will generate the region code. /// \param FiniCB Callback to finialize variable copies. /// - /// \returns The insertion position *after* the master. + /// \returns The insertion position *after* the masked. InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter); @@ -848,7 +848,7 @@ class OpenMPIRBuilder { /// \param CriticalName name of the lock used by the critical directive /// \param HintInst Hint Instruction for hint clause associated with critical /// - /// \returns The insertion position *after* the master. + /// \returns The insertion position *after* the critical. 
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, From 0deedaa23f71587d121a5ffab40571fd4a64599a Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 11 Aug 2021 14:25:50 +0100 Subject: [PATCH 192/700] [hwasan] Prevent reordering of tag checks. They were previously unconstrained, which allowed them to be reordered before the shadow memory write. Reviewed By: eugenis Differential Revision: https://reviews.llvm.org/D107901 --- llvm/include/llvm/IR/Intrinsics.td | 4 ++-- .../HWAddressSanitizer/memaccess-clobber.ll | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/memaccess-clobber.ll diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 61165ab9e1369..b4ae7b1dd5866 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1570,10 +1570,10 @@ def int_load_relative: DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_a def int_hwasan_check_memaccess : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, ImmArg>]>; + [ImmArg>]>; def int_hwasan_check_memaccess_shortgranules : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_i32_ty], - [IntrInaccessibleMemOnly, ImmArg>]>; + [ImmArg>]>; // Xray intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/memaccess-clobber.ll b/llvm/test/Instrumentation/HWAddressSanitizer/memaccess-clobber.ll new file mode 100644 index 0000000000000..2a23079e532ce --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/memaccess-clobber.ll @@ -0,0 +1,20 @@ +; Make sure memaccess checks precede the following reads. 
+; +; RUN: opt < %s -S -enable-new-pm=0 -hwasan -basic-aa -memdep -print-memdeps -analyze -mtriple aarch64-linux-android30 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android10000" + +declare void @use32(i32*) + +define i32 @test_alloca() sanitize_hwaddress { +entry: + %x = alloca i32, align 4 + call void @use32(i32* nonnull %x) + ; CHECK: Clobber from: call void @llvm.hwasan.check.memaccess.shortgranule + ; CHECK-NEXT: load i32, i32* %x.hwasan, align 4 + %y = load i32, i32* %x + ; CHECK: Clobber from: %y = load i32, i32* %x.hwasan, align 4 + ; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 {{.*}}, i8 0, i64 1, i1 false) + ret i32 %y +} From 8f8f9260a95f784e8c4620d652986724ac1b88df Mon Sep 17 00:00:00 2001 From: Anton Afanasyev Date: Thu, 12 Aug 2021 14:51:57 +0300 Subject: [PATCH 193/700] [Test][AggressiveInstCombine] Add test for shifts Precommit test for D107766/D108091. Also move fixed test for PR50555 from SLPVectorizer/X86/ to PhaseOrdering/X86/ subdirectory. 
--- .../AggressiveInstCombine/trunc_shifts.ll | 362 ++++++++++++++++++ .../X86/pr50555.ll | 4 +- 2 files changed, 364 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll rename llvm/test/Transforms/{SLPVectorizer => PhaseOrdering}/X86/pr50555.ll (98%) diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll new file mode 100644 index 0000000000000..67d78293564e7 --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll @@ -0,0 +1,362 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -aggressive-instcombine -S | FileCheck %s + +define i16 @shl_1(i8 %x) { +; CHECK-LABEL: @shl_1( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i8 %x to i32 + %shl = shl i32 %zext, 1 + %trunc = trunc i32 %shl to i16 + ret i16 %trunc +} + +define i16 @shl_15(i8 %x) { +; CHECK-LABEL: @shl_15( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 15 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i8 %x to i32 + %shl = shl i32 %zext, 15 + %trunc = trunc i32 %shl to i16 + ret i16 %trunc +} + +; Negative test - shift amount isn't less than target bitwidth + +define i16 @shl_16(i8 %x) { +; CHECK-LABEL: @shl_16( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 16 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i8 %x to i32 + %shl = shl i32 %zext, 16 + %trunc = trunc i32 %shl to i16 + ret i16 %trunc +} + +; Negative test -- variable shift amount + +define i16 @shl_var_shift_amount(i8 %x, i8 %y) { +; CHECK-LABEL: 
@shl_var_shift_amount( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT_X]], [[ZEXT_Y]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext.x = zext i8 %x to i32 + %zext.y = zext i8 %y to i32 + %shl = shl i32 %zext.x, %zext.y + %trunc = trunc i32 %shl to i16 + ret i16 %trunc +} + +define i16 @shl_var_bounded_shift_amount(i8 %x, i8 %y) { +; CHECK-LABEL: @shl_var_bounded_shift_amount( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT_Y]], 15 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT_X]], [[AND]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext.x = zext i8 %x to i32 + %zext.y = zext i8 %y to i32 + %and = and i32 %zext.y, 15 + %shl = shl i32 %zext.x, %and + %trunc = trunc i32 %shl to i16 + ret i16 %trunc +} + +define <2 x i16> @shl_vector(<2 x i8> %x) { +; CHECK-LABEL: @shl_vector( +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[S:%.*]] = shl <2 x i32> [[Z]], +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[T]] +; + %z = zext <2 x i8> %x to <2 x i32> + %s = shl <2 x i32> %z, + %t = trunc <2 x i32> %s to <2 x i16> + ret <2 x i16> %t +} + +; Negative test - can only fold to <2 x i16>, requiring new vector type + +define <2 x i8> @shl_vector_no_new_vector_type(<2 x i8> %x) { +; CHECK-LABEL: @shl_vector_no_new_vector_type( +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[S:%.*]] = shl <2 x i32> [[Z]], +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[T]] +; + %z = zext <2 x i8> %x to <2 x i32> + %s = shl <2 x i32> %z, + %t = trunc <2 x i32> %s to <2 x i8> + ret <2 x i8> %t +} + +; Negative test + +define <2 x i16> 
@shl_vector_large_shift_amount(<2 x i8> %x) { +; CHECK-LABEL: @shl_vector_large_shift_amount( +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[S:%.*]] = shl <2 x i32> [[Z]], +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[T]] +; + %z = zext <2 x i8> %x to <2 x i32> + %s = shl <2 x i32> %z, + %t = trunc <2 x i32> %s to <2 x i16> + ret <2 x i16> %t +} + +define i16 @shl_nuw(i8 %x) { +; CHECK-LABEL: @shl_nuw( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[S:%.*]] = shl nuw i32 [[Z]], 15 +; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16 +; CHECK-NEXT: ret i16 [[T]] +; + %z = zext i8 %x to i32 + %s = shl nuw i32 %z, 15 + %t = trunc i32 %s to i16 + ret i16 %t +} + +define i16 @shl_nsw(i8 %x) { +; CHECK-LABEL: @shl_nsw( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[S:%.*]] = shl nsw i32 [[Z]], 15 +; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16 +; CHECK-NEXT: ret i16 [[T]] +; + %z = zext i8 %x to i32 + %s = shl nsw i32 %z, 15 + %t = trunc i32 %s to i16 + ret i16 %t +} + +define i16 @lshr_15(i16 %x) { +; CHECK-LABEL: @lshr_15( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[ZEXT]], 15 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i16 %x to i32 + %lshr = lshr i32 %zext, 15 + %trunc = trunc i32 %lshr to i16 + ret i16 %trunc +} + +; Negative test + +define i16 @lshr_16(i16 %x) { +; CHECK-LABEL: @lshr_16( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[ZEXT]], 16 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i16 %x to i32 + %lshr = lshr i32 %zext, 16 + %trunc = trunc i32 %lshr to i16 + ret i16 %trunc +} + +; Negative test + +define i16 @lshr_var_shift_amount(i8 %x, i8 %amt) { +; CHECK-LABEL: @lshr_var_shift_amount( +; CHECK-NEXT: 
[[Z:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i32 +; CHECK-NEXT: [[S:%.*]] = lshr i32 [[Z]], [[ZA]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[S]], [[Z]] +; CHECK-NEXT: [[S2:%.*]] = lshr i32 [[A]], 2 +; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S2]] to i16 +; CHECK-NEXT: ret i16 [[T]] +; + %z = zext i8 %x to i32 + %za = zext i8 %amt to i32 + %s = lshr i32 %z, %za + %a = add i32 %s, %z + %s2 = lshr i32 %a, 2 + %t = trunc i32 %s2 to i16 + ret i16 %t +} + +define i16 @lshr_var_bounded_shift_amount(i8 %x, i8 %amt) { +; CHECK-LABEL: @lshr_var_bounded_shift_amount( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[ZA:%.*]] = zext i8 [[AMT:%.*]] to i32 +; CHECK-NEXT: [[ZA2:%.*]] = and i32 [[ZA]], 15 +; CHECK-NEXT: [[S:%.*]] = lshr i32 [[Z]], [[ZA2]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[S]], [[Z]] +; CHECK-NEXT: [[S2:%.*]] = lshr i32 [[A]], 2 +; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S2]] to i16 +; CHECK-NEXT: ret i16 [[T]] +; + %z = zext i8 %x to i32 + %za = zext i8 %amt to i32 + %za2 = and i32 %za, 15 + %s = lshr i32 %z, %za2 + %a = add i32 %s, %z + %s2 = lshr i32 %a, 2 + %t = trunc i32 %s2 to i16 + ret i16 %t +} + +define void @lshr_big_dag(i16* %a, i8 %b, i8 %c) { +; CHECK-LABEL: @lshr_big_dag( +; CHECK-NEXT: [[ZEXT1:%.*]] = zext i8 [[B:%.*]] to i32 +; CHECK-NEXT: [[ZEXT2:%.*]] = zext i8 [[C:%.*]] to i32 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ZEXT1]], [[ZEXT2]] +; CHECK-NEXT: [[SFT1:%.*]] = and i32 [[ADD1]], 15 +; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[ADD1]], [[SFT1]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[SHR1]] +; CHECK-NEXT: [[SFT2:%.*]] = and i32 [[ADD2]], 7 +; CHECK-NEXT: [[SHR2:%.*]] = lshr i32 [[ADD2]], [[SFT2]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR2]] to i16 +; CHECK-NEXT: store i16 [[TRUNC]], i16* [[A:%.*]], align 2 +; CHECK-NEXT: ret void +; + %zext1 = zext i8 %b to i32 + %zext2 = zext i8 %c to i32 + %add1 = add i32 %zext1, %zext2 + %sft1 = and i32 %add1, 15 + %shr1 = lshr i32 %add1, %sft1 + 
%add2 = add i32 %add1, %shr1 + %sft2 = and i32 %add2, 7 + %shr2 = lshr i32 %add2, %sft2 + %trunc = trunc i32 %shr2 to i16 + store i16 %trunc, i16* %a, align 2 + ret void +} + +define <2 x i16> @lshr_vector(<2 x i8> %x) { +; CHECK-LABEL: @lshr_vector( +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[ZA:%.*]] = and <2 x i32> [[Z]], +; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[Z]], [[ZA]] +; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[S]], [[Z]] +; CHECK-NEXT: [[S2:%.*]] = lshr <2 x i32> [[A]], +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S2]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[T]] +; + %z = zext <2 x i8> %x to <2 x i32> + %za = and <2 x i32> %z, + %s = lshr <2 x i32> %z, %za + %a = add <2 x i32> %s, %z + %s2 = lshr <2 x i32> %a, + %t = trunc <2 x i32> %s2 to <2 x i16> + ret <2 x i16> %t +} + +; Negative test - can only fold to <2 x i16>, requiring new vector type + +define <2 x i8> @lshr_vector_no_new_vector_type(<2 x i8> %x) { +; CHECK-LABEL: @lshr_vector_no_new_vector_type( +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[ZA:%.*]] = and <2 x i32> [[Z]], +; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[Z]], [[ZA]] +; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[S]], [[Z]] +; CHECK-NEXT: [[S2:%.*]] = lshr <2 x i32> [[A]], +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S2]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[T]] +; + %z = zext <2 x i8> %x to <2 x i32> + %za = and <2 x i32> %z, + %s = lshr <2 x i32> %z, %za + %a = add <2 x i32> %s, %z + %s2 = lshr <2 x i32> %a, + %t = trunc <2 x i32> %s2 to <2 x i8> + ret <2 x i8> %t +} + +; Negative test + +define <2 x i16> @lshr_vector_large_shift_amount(<2 x i8> %x) { +; CHECK-LABEL: @lshr_vector_large_shift_amount( +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[ZA:%.*]] = and <2 x i32> [[Z]], +; CHECK-NEXT: [[S:%.*]] = lshr <2 x i32> [[Z]], [[ZA]] +; CHECK-NEXT: [[A:%.*]] = add <2 x i32> [[S]], [[Z]] +; CHECK-NEXT: [[S2:%.*]] = lshr <2 
x i32> [[A]], +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S2]] to <2 x i16> +; CHECK-NEXT: ret <2 x i16> [[T]] +; + %z = zext <2 x i8> %x to <2 x i32> + %za = and <2 x i32> %z, + %s = lshr <2 x i32> %z, %za + %a = add <2 x i32> %s, %z + %s2 = lshr <2 x i32> %a, + %t = trunc <2 x i32> %s2 to <2 x i16> + ret <2 x i16> %t +} + +define i16 @lshr_exact(i16 %x) { +; CHECK-LABEL: @lshr_exact( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32 +; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i32 [[ZEXT]], 15 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[LSHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i16 %x to i32 + %lshr = lshr exact i32 %zext, 15 + %trunc = trunc i32 %lshr to i16 + ret i16 %trunc +} + +; Negative test + +define i16 @ashr_negative(i16 %x) { +; CHECK-LABEL: @ashr_negative( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[ZEXT]], 15 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i16 %x to i32 + %ashr = ashr i32 %zext, 15 + %trunc = trunc i32 %ashr to i16 + ret i16 %trunc +} + +define i16 @ashr_positive(i16 %x) { +; CHECK-LABEL: @ashr_positive( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT]], 32767 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i32 [[AND]], 15 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i16 %x to i32 + %and = and i32 %zext, 32767 + %ashr = ashr i32 %and, 15 + %trunc = trunc i32 %ashr to i16 + ret i16 %trunc +} + +define i16 @ashr_exact(i16 %x) { +; CHECK-LABEL: @ashr_exact( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[X:%.*]] to i32 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT]], 32767 +; CHECK-NEXT: [[ASHR:%.*]] = ashr exact i32 [[AND]], 15 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ASHR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] +; + %zext = zext i16 %x to i32 + %and = and i32 %zext, 32767 + %ashr = ashr exact i32 %and, 15 
+ %trunc = trunc i32 %ashr to i16 + ret i16 %trunc +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll similarity index 98% rename from llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll rename to llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll index 818ba6450fcb6..da457578a79c1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr50555.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-- -aggressive-instcombine -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=SSE -; RUN: opt < %s -mtriple=x86_64-- -mcpu=corei7-avx -aggressive-instcombine -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -O3 -S -mtriple=x86_64-- | FileCheck %s --check-prefixes=SSE +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX define void @trunc_through_one_add(i16* noalias %0, i8* noalias readonly %1) { ; SSE-LABEL: @trunc_through_one_add( From 1f3e35b6d165715ec7bf7ba80d5b982719c7752a Mon Sep 17 00:00:00 2001 From: Anton Afanasyev Date: Thu, 12 Aug 2021 14:51:57 +0300 Subject: [PATCH 194/700] [AggressiveInstCombine] Add shift left instruction to `TruncInstCombine` DAG Add `shl` instruction to the DAG post-dominated by `trunc`, allowing TruncInstCombine to reduce bitwidth of expressions containing left shifts. 
The only thing we need to check is that the target bitwidth must be wider than the maximal shift amount: https://alive2.llvm.org/ce/z/AwArqu Part of https://reviews.llvm.org/D107766 Differential Revision: https://reviews.llvm.org/D108091 --- .../TruncInstCombine.cpp | 26 ++++++++++- .../AggressiveInstCombine/trunc_shifts.ll | 46 ++++++++----------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index 16b82219e8ca3..b614cfd7b9b09 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -29,10 +29,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" +#include "llvm/Support/KnownBits.h" using namespace llvm; @@ -61,6 +63,7 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) { case Instruction::And: case Instruction::Or: case Instruction::Xor: + case Instruction::Shl: Ops.push_back(I->getOperand(0)); Ops.push_back(I->getOperand(1)); break; @@ -127,6 +130,7 @@ bool TruncInstCombine::buildTruncExpressionDag() { case Instruction::And: case Instruction::Or: case Instruction::Xor: + case Instruction::Shl: case Instruction::Select: { SmallVector Operands; getRelevantOperands(I, Operands); @@ -137,7 +141,7 @@ bool TruncInstCombine::buildTruncExpressionDag() { // TODO: Can handle more cases here: // 1. shufflevector, extractelement, insertelement // 2. udiv, urem - // 3. shl, lshr, ashr + // 3. lshr, ashr // 4. phi node(and loop handling) // ... 
return false; @@ -270,6 +274,23 @@ Type *TruncInstCombine::getBestTruncatedType() { unsigned OrigBitWidth = CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits(); + // Initialize MinBitWidth for `shl` instructions with the minimum number + // that is greater than shift amount (i.e. shift amount + 1). + // Also normalize MinBitWidth not to be greater than source bitwidth. + for (auto &Itr : InstInfoMap) { + Instruction *I = Itr.first; + if (I->getOpcode() == Instruction::Shl) { + KnownBits KnownRHS = computeKnownBits(I->getOperand(1), DL); + const unsigned SrcBitWidth = KnownRHS.getBitWidth(); + unsigned MinBitWidth = + KnownRHS.getMaxValue().uadd_sat(APInt(SrcBitWidth, 1)).getZExtValue(); + MinBitWidth = std::min(MinBitWidth, SrcBitWidth); + if (MinBitWidth >= OrigBitWidth) + return nullptr; + Itr.second.MinBitWidth = MinBitWidth; + } + } + // Calculate minimum allowed bit-width allowed for shrinking the currently // visited truncate's operand. unsigned MinBitWidth = getMinBitWidth(); @@ -356,7 +377,8 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { case Instruction::Mul: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: + case Instruction::Shl: { Value *LHS = getReducedOperand(I->getOperand(0), SclTy); Value *RHS = getReducedOperand(I->getOperand(1), SclTy); Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS); diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll index 67d78293564e7..e7f491aa51054 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_shifts.ll @@ -3,10 +3,9 @@ define i16 @shl_1(i8 %x) { ; CHECK-LABEL: @shl_1( -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 1 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 -; CHECK-NEXT: ret i16 [[TRUNC]] +; CHECK-NEXT: 
[[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[ZEXT]], 1 +; CHECK-NEXT: ret i16 [[SHL]] ; %zext = zext i8 %x to i32 %shl = shl i32 %zext, 1 @@ -16,10 +15,9 @@ define i16 @shl_1(i8 %x) { define i16 @shl_15(i8 %x) { ; CHECK-LABEL: @shl_15( -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT]], 15 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 -; CHECK-NEXT: ret i16 [[TRUNC]] +; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[ZEXT]], 15 +; CHECK-NEXT: ret i16 [[SHL]] ; %zext = zext i8 %x to i32 %shl = shl i32 %zext, 15 @@ -61,12 +59,11 @@ define i16 @shl_var_shift_amount(i8 %x, i8 %y) { define i16 @shl_var_bounded_shift_amount(i8 %x, i8 %y) { ; CHECK-LABEL: @shl_var_bounded_shift_amount( -; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[ZEXT_Y]], 15 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[ZEXT_X]], [[AND]] -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL]] to i16 -; CHECK-NEXT: ret i16 [[TRUNC]] +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i16 +; CHECK-NEXT: [[AND:%.*]] = and i16 [[ZEXT_Y]], 15 +; CHECK-NEXT: [[SHL:%.*]] = shl i16 [[ZEXT_X]], [[AND]] +; CHECK-NEXT: ret i16 [[SHL]] ; %zext.x = zext i8 %x to i32 %zext.y = zext i8 %y to i32 @@ -78,10 +75,9 @@ define i16 @shl_var_bounded_shift_amount(i8 %x, i8 %y) { define <2 x i16> @shl_vector(<2 x i8> %x) { ; CHECK-LABEL: @shl_vector( -; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[S:%.*]] = shl <2 x i32> [[Z]], -; CHECK-NEXT: [[T:%.*]] = trunc <2 x i32> [[S]] to <2 x i16> -; CHECK-NEXT: ret <2 x i16> [[T]] +; CHECK-NEXT: [[Z:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i16> +; CHECK-NEXT: [[S:%.*]] = shl <2 x i16> [[Z]], +; CHECK-NEXT: ret <2 x i16> [[S]] ; %z = zext <2 x i8> %x to <2 x i32> %s = shl 
<2 x i32> %z, @@ -121,10 +117,9 @@ define <2 x i16> @shl_vector_large_shift_amount(<2 x i8> %x) { define i16 @shl_nuw(i8 %x) { ; CHECK-LABEL: @shl_nuw( -; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[S:%.*]] = shl nuw i32 [[Z]], 15 -; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16 -; CHECK-NEXT: ret i16 [[T]] +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[S:%.*]] = shl i16 [[Z]], 15 +; CHECK-NEXT: ret i16 [[S]] ; %z = zext i8 %x to i32 %s = shl nuw i32 %z, 15 @@ -134,10 +129,9 @@ define i16 @shl_nuw(i8 %x) { define i16 @shl_nsw(i8 %x) { ; CHECK-LABEL: @shl_nsw( -; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i32 -; CHECK-NEXT: [[S:%.*]] = shl nsw i32 [[Z]], 15 -; CHECK-NEXT: [[T:%.*]] = trunc i32 [[S]] to i16 -; CHECK-NEXT: ret i16 [[T]] +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[S:%.*]] = shl i16 [[Z]], 15 +; CHECK-NEXT: ret i16 [[S]] ; %z = zext i8 %x to i32 %s = shl nsw i32 %z, 15 From 895ed64009c024f9e6608f574f9ab083e421ae57 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 17 Aug 2021 11:22:49 +0100 Subject: [PATCH 195/700] [AArch64] LowerCONCAT_VECTORS - merge getNumOperands() calls. NFCI. 
Improves on the unused variable fix from rG4357562067003e25ab343a2d67a60bd89cd66dbf --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9bbeb151d56bc..883621973b07a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10461,11 +10461,10 @@ SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, if (isTypeLegal(Op.getOperand(0).getValueType())) { unsigned NumOperands = Op->getNumOperands(); - (void)NumOperands; assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - if (Op.getNumOperands() == 2) + if (NumOperands == 2) return Op; // Concat each pair of subvectors and pack into the lower half of the array. From 708cbda5771aecf84e93c4e7f5d6f78bbc92af6e Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 17 Aug 2021 11:32:41 +0100 Subject: [PATCH 196/700] [DebugInfo][InstrRef] Honour too-much-debug-info cutouts This reapplies 54a61c94f93, its follow up in 547b712500e, which were reverted 95fe61e63954. Original commit message: VarLoc based LiveDebugValues will abandon variable location propagation if there are too many blocks and variable assignments in the function. If it didn't, and we had (say) 1000 blocks and 1000 variables in scope, we'd end up with 1 million DBG_VALUEs just at the start of blocks. Instruction-referencing LiveDebugValues should honour this limitation too (because the same limitation applies to it). Hoist the relevant command line options into LiveDebugValues.cpp and pass it down into the implementation classes as an argument to ExtendRanges. I've duplicated all the run-lines in live-debug-values-cutoffs.mir to have an instruction-referencing flavour. 
Differential Revision: https://reviews.llvm.org/D107823 --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 56 ++++++++++++------- .../LiveDebugValues/LiveDebugValues.cpp | 14 ++++- .../CodeGen/LiveDebugValues/LiveDebugValues.h | 4 +- .../LiveDebugValues/VarLocBasedImpl.cpp | 21 ++----- .../MIR/X86/live-debug-values-cutoffs.mir | 20 +++++++ 5 files changed, 79 insertions(+), 36 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index dc99070583406..01338ab101a8c 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1684,7 +1684,8 @@ class InstrRefBasedLDV : public LDVImpl { /// RPOT block ordering. void initialSetup(MachineFunction &MF); - bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override; + bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, unsigned InputDbgValLimit) override; public: /// Default construct and initialize the pass. @@ -3523,8 +3524,9 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. -bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, - TargetPassConfig *TPC) { +bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, + unsigned InputDbgValLimit) { // No subprogram means this function contains no debuginfo. if (!MF.getFunction().getSubprogram()) return false; @@ -3626,6 +3628,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // To mirror old LiveDebugValues, enumerate variables in RPOT order. Otherwise // the order is unimportant, it just has to be stable. 
+ unsigned VarAssignCount = 0; for (unsigned int I = 0; I < OrderToBB.size(); ++I) { auto *MBB = OrderToBB[I]; auto *VTracker = &vlocs[MBB->getNumber()]; @@ -3643,24 +3646,42 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, ScopeToVars[Scope].insert(Var); ScopeToBlocks[Scope].insert(VTracker->MBB); ScopeToDILocation[Scope] = ScopeLoc; + ++VarAssignCount; } } - // OK. Iterate over scopes: there might be something to be said for - // ordering them by size/locality, but that's for the future. For each scope, - // solve the variable value problem, producing a map of variables to values - // in SavedLiveIns. - for (auto &P : ScopeToVars) { - vlocDataflow(P.first, ScopeToDILocation[P.first], P.second, - ScopeToBlocks[P.first], SavedLiveIns, MOutLocs, MInLocs, - vlocs); - } + bool Changed = false; + + // If we have an extremely large number of variable assignments and blocks, + // bail out at this point. We've burnt some time doing analysis already, + // however we should cut our losses. + if ((unsigned)MaxNumBlocks > InputBBLimit && + VarAssignCount > InputDbgValLimit) { + LLVM_DEBUG(dbgs() << "Disabling InstrRefBasedLDV: " << MF.getName() + << " has " << MaxNumBlocks << " basic blocks and " + << VarAssignCount + << " variable assignments, exceeding limits.\n"); + } else { + // Compute the extended ranges, iterating over scopes. There might be + // something to be said for ordering them by size/locality, but that's for + // the future. For each scope, solve the variable value problem, producing + // a map of variables to values in SavedLiveIns. + for (auto &P : ScopeToVars) { + vlocDataflow(P.first, ScopeToDILocation[P.first], P.second, + ScopeToBlocks[P.first], SavedLiveIns, MOutLocs, MInLocs, + vlocs); + } + + // Using the computed value locations and variable values for each block, + // create the DBG_VALUE instructions representing the extended variable + // locations. 
+ emitLocations(MF, SavedLiveIns, MOutLocs, MInLocs, AllVarsNumbering, *TPC); - // Using the computed value locations and variable values for each block, - // create the DBG_VALUE instructions representing the extended variable - // locations. - emitLocations(MF, SavedLiveIns, MOutLocs, MInLocs, AllVarsNumbering, *TPC); + // Did we actually make any changes? If we created any DBG_VALUEs, then yes. + Changed = TTracker->Transfers.size() != 0; + } + // Common clean-up of memory. for (int Idx = 0; Idx < MaxNumBlocks; ++Idx) { delete[] MOutLocs[Idx]; delete[] MInLocs[Idx]; @@ -3668,9 +3689,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, delete[] MOutLocs; delete[] MInLocs; - // Did we actually make any changes? If we created any DBG_VALUEs, then yes. - bool Changed = TTracker->Transfers.size() != 0; - delete MTracker; delete TTracker; MTracker = nullptr; diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index 38e803d1abb55..bc1eaff60440f 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -40,6 +40,18 @@ static cl::opt "normal DBG_VALUE inputs"), cl::init(false)); +// Options to prevent pathological compile-time behavior. If InputBBLimit and +// InputDbgValueLimit are both exceeded, range extension is disabled. +static cl::opt InputBBLimit( + "livedebugvalues-input-bb-limit", + cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"), + cl::init(10000), cl::Hidden); +static cl::opt InputDbgValueLimit( + "livedebugvalues-input-dbg-value-limit", + cl::desc( + "Maximum input DBG_VALUE insts supported by debug range extension"), + cl::init(50000), cl::Hidden); + /// Generic LiveDebugValues pass. Calls through to VarLocBasedLDV or /// InstrRefBasedLDV to perform location propagation, via the LDVImpl /// base class. 
@@ -103,5 +115,5 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) { TheImpl = llvm::makeVarLocBasedLiveDebugValues(); } - return TheImpl->ExtendRanges(MF, TPC); + return TheImpl->ExtendRanges(MF, TPC, InputBBLimit, InputDbgValueLimit); } diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h index 9c910f180b9fb..e38360b08bafa 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h @@ -23,7 +23,9 @@ inline namespace SharedLiveDebugValues { // implementation. class LDVImpl { public: - virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) = 0; + virtual bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, + unsigned InputDbgValLimit) = 0; virtual ~LDVImpl() {} }; diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 1e6d65c189535..977d3ede5c776 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -166,18 +166,6 @@ using namespace llvm; STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); -// Options to prevent pathological compile-time behavior. If InputBBLimit and -// InputDbgValueLimit are both exceeded, range extension is disabled. -static cl::opt InputBBLimit( - "livedebugvalues-input-bb-limit", - cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"), - cl::init(10000), cl::Hidden); -static cl::opt InputDbgValueLimit( - "livedebugvalues-input-dbg-value-limit", - cl::desc( - "Maximum input DBG_VALUE insts supported by debug range extension"), - cl::init(50000), cl::Hidden); - /// If \p Op is a stack or frame register return true, otherwise return false. /// This is used to avoid basing the debug entry values on the registers, since /// we do not support it at the moment. 
@@ -1007,7 +995,8 @@ class VarLocBasedLDV : public LDVImpl { /// had their instruction creation deferred. void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs); - bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) override; + bool ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, unsigned InputDbgValLimit) override; public: /// Default construct and initialize the pass. @@ -2048,7 +2037,9 @@ void VarLocBasedLDV::recordEntryValue(const MachineInstr &MI, /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks. -bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) { +bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC, + unsigned InputBBLimit, + unsigned InputDbgValLimit) { LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n"); if (!MF.getFunction().getSubprogram()) @@ -2141,7 +2132,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TargetPassConfig *TPC) { for (auto &MI : MBB) if (MI.isDebugValue()) ++NumInputDbgValues; - if (NumInputDbgValues > InputDbgValueLimit) { + if (NumInputDbgValues > InputDbgValLimit) { LLVM_DEBUG(dbgs() << "Disabling VarLocBasedLDV: " << MF.getName() << " has " << RPONumber << " basic blocks and " << NumInputDbgValues diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir index 4922c36086f16..17b6b9b3149c3 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-cutoffs.mir @@ -5,21 +5,41 @@ # RUN: -livedebugvalues-input-bb-limit=1 \ # RUN: -livedebugvalues-input-dbg-value-limit=1 \ # RUN: | FileCheck %s -check-prefix=LDV-DISABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=1 \ +# RUN: 
-livedebugvalues-input-dbg-value-limit=1 \ +# RUN: | FileCheck %s -check-prefix=LDV-DISABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=1 \ # RUN: -livedebugvalues-input-dbg-value-limit=10 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=1 \ +# RUN: -livedebugvalues-input-dbg-value-limit=10 \ +# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=10 \ # RUN: -livedebugvalues-input-dbg-value-limit=1 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=10 \ +# RUN: -livedebugvalues-input-dbg-value-limit=1 \ +# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ # RUN: -livedebugvalues-input-bb-limit=10 \ # RUN: -livedebugvalues-input-dbg-value-limit=10 \ # RUN: | FileCheck %s -check-prefix=LDV-ENABLED +# RUN: llc %s -o - -run-pass=livedebugvalues -mtriple=x86_64-unknown-unknown \ +# RUN: -experimental-debug-variable-locations \ +# RUN: -livedebugvalues-input-bb-limit=10 \ +# RUN: -livedebugvalues-input-dbg-value-limit=10 \ +# RUN: | FileCheck %s -check-prefix=LDV-ENABLED # LDV-DISABLED-LABEL: bb.1.exit # LDV-DISABLED-NEXT: $edi = MOV32rm From 9cfa9b44a589438d3c6920881c5619c76479dbaa Mon Sep 17 00:00:00 2001 From: Tiehu Zhang Date: Tue, 17 Aug 2021 18:50:54 +0800 Subject: [PATCH 197/700] [CodeGenPrepare] The instruction to be sunk should be inserted before its user in a block In current implementation, the instruction to be sunk will be inserted before the target instruction without 
considering the def-use tree, which may case Instruction does not dominate all uses error. We need to choose a suitable location to insert according to the use chain Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D107262 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 14 +- .../sink-free-instructions-inseltpoison.ll | 32 ++--- .../AArch64/sink-free-instructions.ll | 124 +++++++++++++++--- .../sink-free-instructions-inseltpoison.ll | 12 +- .../ARM/sink-free-instructions.ll | 12 +- 5 files changed, 148 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 77ce3d2fb5633..50e59399d2a72 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6950,16 +6950,26 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { BasicBlock *TargetBB = I->getParent(); bool Changed = false; SmallVector ToReplace; + Instruction *InsertPoint = I; + DenseMap InstOrdering; + unsigned long InstNumber = 0; + for (const auto &I : *TargetBB) + InstOrdering[&I] = InstNumber++; + for (Use *U : reverse(OpsToSink)) { auto *UI = cast(U->get()); - if (UI->getParent() == TargetBB || isa(UI)) + if (isa(UI)) + continue; + if (UI->getParent() == TargetBB) { + if (InstOrdering[UI] < InstOrdering[InsertPoint]) + InsertPoint = UI; continue; + } ToReplace.push_back(U); } SetVector MaybeDead; DenseMap NewInstructions; - Instruction *InsertPoint = I; for (Use *U : ToReplace) { auto *UI = cast(U->get()); Instruction *NI = UI->clone(); diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll index b7804468d9581..04c94eb187ef9 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions-inseltpoison.ll @@ -9,13 +9,13 @@ define <8 x i16> @sink_zext(<8 x i8> 
%a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: -; CHECK-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +; CHECK-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; CHECK-NEXT: ret <8 x i16> [[RES_2]] ; @@ -39,13 +39,13 @@ define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: -; CHECK-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +; CHECK-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; CHECK-NEXT: ret <8 x i16> [[RES_2]] ; @@ -69,8 +69,8 @@ define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = 
add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: @@ -96,8 +96,8 @@ define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: @@ -124,13 +124,13 @@ define <8 x i16> @sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]]) ; CHECK-NEXT: ret <8 x i16> [[VMULL0]] ; CHECK: if.else: -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]]) ; CHECK-NEXT: ret <8 x i16> [[VMULL1]] ; @@ -156,17 +156,17 @@ define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> -; 
CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[RES1:%.*]] = add <8 x i16> [[TMP1]], [[Z2]] ; CHECK-NEXT: ret <8 x i16> [[RES1]] ; CHECK: if.else: -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[RES2:%.*]] = sub <8 x i16> [[TMP3]], [[Z4]] ; CHECK-NEXT: ret <8 x i16> [[RES2]] ; @@ -202,17 +202,17 @@ define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> ; CHECK-NEXT: call void @user1(<8 x i16> [[Z3]]) ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[RES1:%.*]] = add <8 x i16> [[TMP1]], [[Z2]] ; CHECK-NEXT: ret <8 x i16> [[RES1]] ; CHECK: if.else: -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> 
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[RES2:%.*]] = sub <8 x i16> [[TMP3]], [[Z4]] ; CHECK-NEXT: ret <8 x i16> [[RES2]] ; diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll index 35221e2ae60f8..5cc1af7193692 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll @@ -9,13 +9,13 @@ define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: -; CHECK-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +; CHECK-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; CHECK-NEXT: ret <8 x i16> [[RES_2]] ; @@ -39,13 +39,13 @@ define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> 
[[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: -; CHECK-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +; CHECK-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; CHECK-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; CHECK-NEXT: ret <8 x i16> [[RES_2]] ; @@ -69,8 +69,8 @@ define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: @@ -96,8 +96,8 @@ define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; CHECK-NEXT: ret <8 x i16> [[RES_1]] ; CHECK: if.else: @@ -124,13 +124,13 @@ define <8 x i16> @sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> 
[[TMP0]], <8 x i8> [[S2]]) ; CHECK-NEXT: ret <8 x i16> [[VMULL0]] ; CHECK: if.else: -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]]) ; CHECK-NEXT: ret <8 x i16> [[VMULL1]] ; @@ -156,17 +156,17 @@ define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> -; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[RES1:%.*]] = add <8 x i16> [[TMP1]], [[Z2]] ; CHECK-NEXT: ret <8 x i16> [[RES1]] ; CHECK: if.else: -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> -; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[RES2:%.*]] = sub <8 x i16> [[TMP3]], [[Z4]] ; CHECK-NEXT: ret <8 x i16> [[RES2]] ; @@ -202,17 +202,17 @@ define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> ; CHECK-NEXT: call void @user1(<8 x i16> [[Z3]]) ; 
CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> -; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; CHECK-NEXT: [[RES1:%.*]] = add <8 x i16> [[TMP1]], [[Z2]] ; CHECK-NEXT: ret <8 x i16> [[RES1]] ; CHECK: if.else: -; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> -; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; CHECK-NEXT: [[RES2:%.*]] = sub <8 x i16> [[TMP3]], [[Z4]] ; CHECK-NEXT: ret <8 x i16> [[RES2]] ; @@ -273,3 +273,95 @@ if.else: ; Function Attrs: nounwind readnone declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2 + +; The insertelement should be inserted before shufflevector, otherwise 'does not dominate all uses' error will occur. 
+define <4 x i32> @sink_insertelement(i16 %e, i8 %f) { +; CHECK-LABEL: @sink_insertelement( +; CHECK-NEXT: for.cond4.preheader.lr.ph: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0 +; CHECK-NEXT: [[CONV25:%.*]] = sext i16 [[E:%.*]] to i32 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]] +; CHECK: for.cond4.preheader.us.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]] +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; CHECK: for.cond4.preheader.preheader: +; CHECK-NEXT: ret <4 x i32> zeroinitializer +; +for.cond4.preheader.lr.ph: + %cmp = icmp slt i8 %f, 0 + %conv25 = sext i16 %e to i32 + %broadcast.splatinsert143 = insertelement <4 x i32> poison, i32 %conv25, i32 0 + br i1 %cmp, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %broadcast.splat144 = shufflevector <4 x i32> %broadcast.splatinsert143, <4 x i32> poison, <4 x i32> zeroinitializer + %0 = mul <4 x i32> zeroinitializer, %broadcast.splat144 + ret <4 x i32> %0 + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + ret <4 x i32> zeroinitializer +} + +define <4 x i32> @sinkadd_partial(<8 x i16> %a1, <8 x i16> %a2, i8 %f) { +; CHECK-LABEL: @sinkadd_partial( +; CHECK-NEXT: for.cond4.preheader.lr.ph: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]] +; CHECK: for.cond4.preheader.us.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[A1:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A2:%.*]], <8 
x i16> poison, <4 x i32> +; CHECK-NEXT: [[E1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[E2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[E1]], [[E2]] +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; CHECK: for.cond4.preheader.preheader: +; CHECK-NEXT: ret <4 x i32> zeroinitializer +; +for.cond4.preheader.lr.ph: + %cmp = icmp slt i8 %f, 0 + %s2 = shufflevector <8 x i16> %a2, <8 x i16> poison, <4 x i32> + %s1 = shufflevector <8 x i16> %a1, <8 x i16> poison, <4 x i32> + br i1 %cmp, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %e1 = sext <4 x i16> %s1 to <4 x i32> + %e2 = sext <4 x i16> %s2 to <4 x i32> + %0 = add <4 x i32> %e1, %e2 + ret <4 x i32> %0 + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + ret <4 x i32> zeroinitializer +} + +define <4 x i32> @sinkadd_partial_rev(<8 x i16> %a1, <8 x i16> %a2, i8 %f) { +; CHECK-LABEL: @sinkadd_partial_rev( +; CHECK-NEXT: for.cond4.preheader.lr.ph: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]] +; CHECK: for.cond4.preheader.us.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i16> [[A1:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A2:%.*]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: [[E2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[E1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[E1]], [[E2]] +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; CHECK: for.cond4.preheader.preheader: +; CHECK-NEXT: ret <4 x i32> zeroinitializer +; +for.cond4.preheader.lr.ph: + %cmp = icmp slt i8 %f, 0 + %s2 = shufflevector <8 x i16> %a2, <8 x i16> poison, <4 x i32> + %s1 = shufflevector <8 x i16> %a1, <8 x i16> poison, <4 x i32> + 
br i1 %cmp, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %e2 = sext <4 x i16> %s2 to <4 x i32> + %e1 = sext <4 x i16> %s1 to <4 x i32> + %0 = add <4 x i32> %e1, %e2 + ret <4 x i32> %0 + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + ret <4 x i32> zeroinitializer +} diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll index fbaf7bfd85ed7..bc813d75e770a 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions-inseltpoison.ll @@ -7,13 +7,13 @@ define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; NEON-NEXT: entry: ; NEON-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; NEON: if.then: -; NEON-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16> +; NEON-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; NEON-NEXT: ret <8 x i16> [[RES_1]] ; NEON: if.else: -; NEON-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +; NEON-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; NEON-NEXT: ret <8 x i16> [[RES_2]] ; @@ -50,13 +50,13 @@ define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; NEON-NEXT: entry: ; NEON-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; NEON: if.then: -; NEON-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; NEON-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[RES_1:%.*]] = 
add <8 x i16> [[TMP0]], [[ZB_1]] ; NEON-NEXT: ret <8 x i16> [[RES_1]] ; NEON: if.else: -; NEON-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +; NEON-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; NEON-NEXT: ret <8 x i16> [[RES_2]] ; @@ -180,14 +180,14 @@ define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> ; NEON-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; NEON: if.then: ; NEON-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> poison, <8 x i32> -; NEON-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; NEON-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[S1]] to <8 x i16> +; NEON-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; NEON-NEXT: [[RES1:%.*]] = add <8 x i16> [[TMP0]], [[Z2]] ; NEON-NEXT: ret <8 x i16> [[RES1]] ; NEON: if.else: ; NEON-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> -; NEON-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; NEON-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[S3]] to <8 x i16> +; NEON-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; NEON-NEXT: [[RES2:%.*]] = sub <8 x i16> [[TMP1]], [[Z4]] ; NEON-NEXT: ret <8 x i16> [[RES2]] ; diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions.ll index 9dd0b373aa2e1..990fb8034ce9b 100644 --- a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions.ll +++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-free-instructions.ll @@ -7,13 +7,13 @@ define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; NEON-NEXT: entry: ; NEON-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; NEON: if.then: -; NEON-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16> +; NEON-NEXT: [[ZB_1:%.*]] 
= zext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; NEON-NEXT: ret <8 x i16> [[RES_1]] ; NEON: if.else: -; NEON-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +; NEON-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; NEON-NEXT: ret <8 x i16> [[RES_2]] ; @@ -50,13 +50,13 @@ define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) { ; NEON-NEXT: entry: ; NEON-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; NEON: if.then: -; NEON-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16> +; NEON-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16> ; NEON-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]] ; NEON-NEXT: ret <8 x i16> [[RES_1]] ; NEON: if.else: -; NEON-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +; NEON-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16> ; NEON-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]] ; NEON-NEXT: ret <8 x i16> [[RES_2]] ; @@ -180,14 +180,14 @@ define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> ; NEON-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; NEON: if.then: ; NEON-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> -; NEON-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; NEON-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[S1]] to <8 x i16> +; NEON-NEXT: [[Z2:%.*]] = zext <8 x i8> [[S2]] to <8 x i16> ; NEON-NEXT: [[RES1:%.*]] = add <8 x i16> [[TMP0]], [[Z2]] ; NEON-NEXT: ret <8 x i16> [[RES1]] ; NEON: if.else: ; NEON-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> -; NEON-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; NEON-NEXT: [[TMP1:%.*]] = sext <8 x i8> 
[[S3]] to <8 x i16> +; NEON-NEXT: [[Z4:%.*]] = sext <8 x i8> [[S4]] to <8 x i16> ; NEON-NEXT: [[RES2:%.*]] = sub <8 x i16> [[TMP1]], [[Z4]] ; NEON-NEXT: ret <8 x i16> [[RES2]] ; From fc5495c351a1f7ce28c7166a70113ce45906ff7b Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Tue, 17 Aug 2021 12:46:44 +0200 Subject: [PATCH 198/700] [lldb] Make TestAArch64AdrpAdd depend on the AArch64 target LLDB is using LLVM's target-specific disassembler which is only available when the respective LLVM target has been enabled in the build config. This patch just skips the test if there is no arm64 target (and its disassembler) available in the current build config. Reviewed By: jasonmolenda Differential Revision: https://reviews.llvm.org/D108145 --- .../disassemble/aarch64-adrp-add/TestAArch64AdrpAdd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/functionalities/disassemble/aarch64-adrp-add/TestAArch64AdrpAdd.py b/lldb/test/API/functionalities/disassemble/aarch64-adrp-add/TestAArch64AdrpAdd.py index 325607028c033..c5b9921d7e62d 100644 --- a/lldb/test/API/functionalities/disassemble/aarch64-adrp-add/TestAArch64AdrpAdd.py +++ b/lldb/test/API/functionalities/disassemble/aarch64-adrp-add/TestAArch64AdrpAdd.py @@ -11,6 +11,7 @@ class TestAArch64AdrpAdd(TestBase): mydir = TestBase.compute_mydir(__file__) @no_debug_info_test + @skipIfLLVMTargetMissing("AArch64") def test_arm64(self): src_dir = self.getSourceDir() yaml_path = os.path.join(src_dir, "a.out-arm64.yaml") @@ -27,6 +28,7 @@ def test_arm64(self): self.disassemble_check_for_hi_and_foo(target, f, binaryname) @no_debug_info_test + @skipIfLLVMTargetMissing("AArch64") def test_arm64_32(self): src_dir = self.getSourceDir() yaml_path = os.path.join(src_dir, "a.out-arm64_32.yaml") From fbae34635d83c106f99ccd11a53305915929bb9a Mon Sep 17 00:00:00 2001 From: Sebastian Neubauer Date: Tue, 17 Aug 2021 13:58:16 +0200 Subject: [PATCH 199/700] [GlobalISel] Add combine for PTR_ADD with regbanks Combine two G_PTR_ADDs, but 
keep the register bank of the constant. That way, the combine can be used in post-regbank-select combines. Introduce two helper methods in CombinerHelper, getRegBank and setRegBank that get and set an optional register bank to a register. That way, they can be used before and after register bank selection. Differential Revision: https://reviews.llvm.org/D103326 --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 18 + .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 21 +- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 2 +- .../GlobalISel/extractelement-stack-lower.ll | 1170 +++++++++-------- .../AMDGPU/GlobalISel/extractelement.ll | 21 +- .../AMDGPU/GlobalISel/insertelement.large.ll | 185 ++- 6 files changed, 732 insertions(+), 685 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 8855631859fcf..d892a7525a6d3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -36,7 +36,10 @@ class GISelKnownBits; class MachineDominatorTree; class LegalizerInfo; struct LegalityQuery; +class RegisterBank; +class RegisterBankInfo; class TargetLowering; +class TargetRegisterInfo; struct PreferredTuple { LLT Ty; // The result type of the extend. @@ -54,6 +57,7 @@ struct IndexedLoadStoreMatchInfo { struct PtrAddChain { int64_t Imm; Register Base; + const RegisterBank *Bank; }; struct RegisterImmPair { @@ -95,6 +99,8 @@ class CombinerHelper { GISelKnownBits *KB; MachineDominatorTree *MDT; const LegalizerInfo *LI; + const RegisterBankInfo *RBI; + const TargetRegisterInfo *TRI; public: CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, @@ -120,6 +126,18 @@ class CombinerHelper { void replaceRegOpWith(MachineRegisterInfo &MRI, MachineOperand &FromRegOp, Register ToReg) const; + /// Get the register bank of \p Reg. 
+ /// If Reg has not been assigned a register, a register class, + /// or a register bank, then this returns nullptr. + /// + /// \pre Reg.isValid() + const RegisterBank *getRegBank(Register Reg) const; + + /// Set the register bank of \p Reg. + /// Does nothing if the RegBank is null. + /// This is the counterpart to getRegBank. + void setRegBank(Register Reg, const RegisterBank *RegBank); + /// If \p MI is COPY, try to combine it. /// Returns true if MI changed. bool tryCombineCopy(MachineInstr &MI); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 732b7ed5dd9d6..949ecacbffd90 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -46,8 +47,9 @@ CombinerHelper::CombinerHelper(GISelChangeObserver &Observer, MachineIRBuilder &B, GISelKnownBits *KB, MachineDominatorTree *MDT, const LegalizerInfo *LI) - : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), - KB(KB), MDT(MDT), LI(LI) { + : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), KB(KB), + MDT(MDT), LI(LI), RBI(Builder.getMF().getSubtarget().getRegBankInfo()), + TRI(Builder.getMF().getSubtarget().getRegisterInfo()) { (void)this->KB; } @@ -143,6 +145,15 @@ void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI, Observer.changedInstr(*FromRegOp.getParent()); } +const RegisterBank *CombinerHelper::getRegBank(Register Reg) const { + return RBI->getRegBank(Reg, MRI, *TRI); +} + +void CombinerHelper::setRegBank(Register Reg, const RegisterBank *RegBank) { + if (RegBank) + MRI.setRegBank(Reg, *RegBank); 
+} + bool CombinerHelper::tryCombineCopy(MachineInstr &MI) { if (matchCombineCopy(MI)) { applyCombineCopy(MI); @@ -1407,7 +1418,6 @@ bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst, // Don't promote to an alignment that would require dynamic stack // realignment. - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign = NewAlign / 2; @@ -1512,7 +1522,6 @@ bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst, // Don't promote to an alignment that would require dynamic stack // realignment. - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!TRI->hasStackRealignment(MF)) while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign)) NewAlign = NewAlign / 2; @@ -1710,7 +1719,7 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI, if (!MaybeImmVal) return false; - MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2); + MachineInstr *Add2Def = MRI.getVRegDef(Add2); if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD) return false; @@ -1751,6 +1760,7 @@ bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI, // Pass the combined immediate to the apply function. 
MatchInfo.Imm = AMNew.BaseOffs; MatchInfo.Base = Base; + MatchInfo.Bank = getRegBank(Imm2); return true; } @@ -1760,6 +1770,7 @@ void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI, MachineIRBuilder MIB(MI); LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg()); auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm); + setRegBank(NewOffset.getReg(0), MatchInfo.Bank); Observer.changingInstr(MI); MI.getOperand(1).setReg(MatchInfo.Base); MI.getOperand(2).setReg(NewOffset.getReg(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c6273adca50f7..28946435af467 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -83,7 +83,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll index 1b146ddc72b91..d4c1670b1c56d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -8,174 +8,191 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v64i32_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_and_b32_e32 v0, 63, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GCN-NEXT: v_add_u32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 
s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off 
offset:32 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword 
v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 
; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: 
buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 63, v2 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GCN-NEXT: v_add_u32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword 
v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, 
off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -193,36 +210,39 @@ define i32 @v_extract_v64i32_varidx(<64 x i32> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword 
v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 
offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr @@ -234,173 +254,190 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v128i16_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, 
off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; 
GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_bfe_u32 v0, v2, 1, 6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 1, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off 
offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 
v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword 
v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: 
buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, 
s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: 
buffer_store_dword v29, off, s[0:3], s33 offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_bfe_u32 v0, v2, 1, 6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_and_b32_e32 v1, 1, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, 
off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -418,40 +455,43 @@ define i16 @v_extract_v128i16_varidx(<128 x i16> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 -; GCN-NEXT: v_lshrrev_b32_e64 v11, 6, s33 -; GCN-NEXT: v_add_u32_e32 v11, 0x100, v11 -; GCN-NEXT: v_add_u32_e32 v0, v11, v0 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_lshrrev_b32_e64 v7, 6, s33 +; GCN-NEXT: v_add_u32_e32 v7, 0x100, v7 +; GCN-NEXT: v_add_u32_e32 v0, v7, v0 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(16) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -464,174 +504,191 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, s33 +; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, 64, v0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword 
v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v1, vcc -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v[3:4], off offset:48 -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s5 -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v4, vcc -; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:64 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:192 -; GCN-NEXT: global_load_dwordx4 v[44:47], v[3:4], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v[3:4], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[3:4], off offset:48 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v6, s5 -; GCN-NEXT: v_mov_b32_e32 v5, s4 -; GCN-NEXT: v_add_co_u32_e32 v60, vcc, v0, v5 -; GCN-NEXT: v_addc_co_u32_e32 v61, vcc, v1, v6, vcc -; GCN-NEXT: v_and_b32_e32 v0, 31, v2 -; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; 
GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 -; GCN-NEXT: v_add_u32_e32 v1, v2, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[11:14], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[15:18], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[19:22], v[0:1], off offset:64 +; GCN-NEXT: global_load_dwordx4 v[23:26], v[0:1], off offset:80 +; GCN-NEXT: global_load_dwordx4 v[27:30], v[0:1], off offset:96 +; GCN-NEXT: global_load_dwordx4 v[31:34], v[0:1], off offset:112 +; GCN-NEXT: global_load_dwordx4 v[35:38], v[0:1], off offset:128 +; GCN-NEXT: global_load_dwordx4 v[39:42], v[0:1], off offset:144 +; 
GCN-NEXT: global_load_dwordx4 v[43:46], v[0:1], off offset:160 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:176 ; GCN-NEXT: s_add_i32 s32, s32, 0x10000 ; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[0:1], off offset:192 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 
offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[4:7], v[60:61], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[48:51], v[60:61], off offset:32 +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 
offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[51:54], v[0:1], off offset:208 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, 
s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[60:63], v[60:61], off offset:48 -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:260 -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:264 -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:268 -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:272 -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:276 -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:280 -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:284 -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:288 -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:292 -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:296 -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:300 -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:304 -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:308 -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:312 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:316 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:320 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:324 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:328 -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:332 -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384 -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388 -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:336 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:340 -; GCN-NEXT: 
buffer_store_dword v10, off, s[0:3], s33 offset:344 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:348 -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:352 -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:356 -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:360 -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:364 -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:368 -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:372 -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:376 -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:380 -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:400 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:404 -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:408 -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:412 -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:416 -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:420 -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:424 -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:428 -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 
offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v12, v20 -; GCN-NEXT: v_mov_b32_e32 v13, v21 -; GCN-NEXT: v_mov_b32_e32 v14, v22 -; GCN-NEXT: v_mov_b32_e32 v15, v23 +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], 
s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[55:58], v[0:1], off offset:224 +; GCN-NEXT: global_load_dwordx4 v[59:62], v[0:1], off offset:240 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:260 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:264 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:268 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:272 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:276 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:280 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:284 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:288 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:292 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:296 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:300 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:304 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:308 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:312 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:316 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:320 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:324 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:328 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:332 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:336 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:340 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:344 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:348 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:352 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:356 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 
offset:360 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:364 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:368 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:372 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:376 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:380 +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:384 +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:388 +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:392 +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:396 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:404 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:408 +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:412 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:416 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:420 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:424 +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:428 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v13, off, s[0:3], s33 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: v_and_b32_e32 v0, 31, v2 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2 +; GCN-NEXT: v_add_u32_e32 v1, v2, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v15 +; GCN-NEXT: v_mov_b32_e32 v13, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: v_mov_b32_e32 v15, v18 ; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:432 ; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:436 ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:440 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:444 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:448 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:452 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:456 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:460 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:588 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:448 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:452 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:456 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:460 ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload @@ -649,37 +706,40 @@ define i64 @v_extract_v32i64_varidx(<32 x i64> addrspace(1)* %ptr, i32 %idx) { ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v11 -; GCN-NEXT: v_mov_b32_e32 v9, v12 -; GCN-NEXT: v_mov_b32_e32 v10, 
v13 -; GCN-NEXT: v_mov_b32_e32 v11, v14 -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484 -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488 -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492 -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:496 -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:500 -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:504 -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:508 +; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v5, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: v_mov_b32_e32 v7, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:476 +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:480 +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:484 +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:488 +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:492 +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:496 +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:500 +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:504 +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:508 ; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded 
Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b32 s33, s6 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, <32 x i64> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 2dedb531bc1bb..548debc54788b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -4138,12 +4138,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) { ; GPRIDX-LABEL: v_extract_v64i32_37: ; GPRIDX: ; %bb.0: ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b64 s[4:5], 0x80 -; GPRIDX-NEXT: v_mov_b32_e32 v2, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s5 -; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GPRIDX-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: v_mov_b32_e32 v0, v5 ; GPRIDX-NEXT: s_setpc_b64 s[30:31] @@ -4151,12 +4146,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) { ; MOVREL-LABEL: v_extract_v64i32_37: ; MOVREL: ; %bb.0: ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; MOVREL-NEXT: s_mov_b64 s[4:5], 0x80 -; MOVREL-NEXT: v_mov_b32_e32 v2, s4 -; MOVREL-NEXT: v_mov_b32_e32 v3, s5 -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; MOVREL-NEXT: s_waitcnt vmcnt(0) @@ 
-4167,12 +4157,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b64 s[4:5], 0x80 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll index 04120780f8e19..e2e5e3369bd9a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -6,123 +6,96 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v68, 8, v0 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v0, v68 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v2 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v64, vcc, v2, v0 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_addc_co_u32_e32 v65, vcc, v3, v1, vcc -; 
GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v3, v1, vcc -; GCN-NEXT: global_load_dwordx4 v[44:47], v68, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[48:51], v68, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v68, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v68, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[60:63], v68, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[4:7], v[64:65], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[64:65], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[64:65], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[66:67], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[66:67], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[66:67], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[0:3], v68, s[0:1] offset:128 -; GCN-NEXT: global_load_dwordx4 v[16:19], v68, s[0:1] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GCN-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 +; GCN-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GCN-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 +; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 +; GCN-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 +; GCN-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] 
offset:224 +; GCN-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GCN-NEXT: s_waitcnt vmcnt(6) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v68, v[0:3], s[2:3] offset:128 -; GCN-NEXT: global_store_dwordx4 v68, v[4:7], s[2:3] offset:144 -; GCN-NEXT: global_store_dwordx4 v68, v[8:11], s[2:3] offset:160 -; GCN-NEXT: global_store_dwordx4 v68, v[12:15], s[2:3] offset:176 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: global_store_dwordx4 v68, v[16:19], s[2:3] offset:192 -; GCN-NEXT: global_store_dwordx4 v68, v[20:23], s[2:3] offset:208 -; GCN-NEXT: global_store_dwordx4 v68, v[24:27], s[2:3] offset:224 -; GCN-NEXT: global_store_dwordx4 v68, v[44:47], s[2:3] -; GCN-NEXT: global_store_dwordx4 v68, v[48:51], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v68, v[52:55], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v68, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v68, v[60:63], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v68, v[28:31], s[2:3] offset:240 -; GCN-NEXT: global_store_dwordx4 v68, v[32:35], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v68, v[36:39], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v68, v[40:43], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 +; GCN-NEXT: 
global_store_dwordx4 v64, v[32:35], s[2:3] +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GCN-NEXT: s_endpgm ; ; GFX10-LABEL: v_insert_v64i32_37: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 8, v0 -; GFX10-NEXT: s_movk_i32 s4, 0x80 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: s_movk_i32 s4, 0xc0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v6, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, s0 -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v70, s[0:1] -; GFX10-NEXT: global_load_dwordx4 v[36:39], v70, s[0:1] offset:16 -; GFX10-NEXT: global_load_dwordx4 v[40:43], v70, s[0:1] offset:32 -; GFX10-NEXT: global_load_dwordx4 v[44:47], v70, s[0:1] offset:48 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v70, s[0:1] offset:64 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v70 -; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo -; GFX10-NEXT: v_add_co_u32 v64, vcc_lo, v0, 64 -; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v66, vcc_lo, v0, v1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v68, vcc_lo, v0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo -; GFX10-NEXT: s_clause 0xa -; GFX10-NEXT: global_load_dwordx4 v[52:55], v[64:65], off offset:16 -; GFX10-NEXT: global_load_dwordx4 
v[56:59], v[64:65], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[60:63], v[64:65], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[66:67], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[66:67], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[12:15], v[66:67], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[68:69], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[24:27], v[68:69], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[28:31], v[68:69], off offset:48 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v70, s[0:1] offset:128 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v70, s[0:1] offset:192 -; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: s_clause 0xf +; GFX10-NEXT: global_load_dwordx4 v[32:35], v64, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[36:39], v64, s[0:1] offset:16 +; GFX10-NEXT: global_load_dwordx4 v[40:43], v64, s[0:1] offset:32 +; GFX10-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] offset:48 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:80 +; GFX10-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:96 +; GFX10-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:112 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v64, s[0:1] offset:144 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v64, s[0:1] offset:160 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v64, s[0:1] offset:176 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v64, s[0:1] offset:208 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v64, s[0:1] offset:224 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v64, s[0:1] offset:240 +; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX10-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 +; GFX10-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 +; GFX10-NEXT: 
s_waitcnt vmcnt(5) +; GFX10-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v70, v[0:3], s[2:3] offset:128 -; GFX10-NEXT: global_store_dwordx4 v70, v[4:7], s[2:3] offset:144 -; GFX10-NEXT: global_store_dwordx4 v70, v[8:11], s[2:3] offset:160 -; GFX10-NEXT: global_store_dwordx4 v70, v[12:15], s[2:3] offset:176 +; GFX10-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 +; GFX10-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] +; GFX10-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:16 +; GFX10-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:32 +; GFX10-NEXT: global_store_dwordx4 v64, v[44:47], s[2:3] offset:48 +; GFX10-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:64 +; GFX10-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:80 +; GFX10-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:96 +; GFX10-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:112 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v70, v[16:19], s[2:3] offset:192 -; GFX10-NEXT: global_store_dwordx4 v70, v[20:23], s[2:3] offset:208 -; GFX10-NEXT: global_store_dwordx4 v70, v[24:27], s[2:3] offset:224 -; GFX10-NEXT: global_store_dwordx4 v70, v[32:35], s[2:3] -; GFX10-NEXT: global_store_dwordx4 v70, v[36:39], s[2:3] offset:16 -; GFX10-NEXT: global_store_dwordx4 v70, v[40:43], s[2:3] offset:32 -; GFX10-NEXT: global_store_dwordx4 v70, v[44:47], s[2:3] offset:48 -; GFX10-NEXT: global_store_dwordx4 v70, v[48:51], s[2:3] offset:64 -; GFX10-NEXT: global_store_dwordx4 v70, v[52:55], s[2:3] offset:80 -; GFX10-NEXT: global_store_dwordx4 v70, v[56:59], s[2:3] 
offset:96 -; GFX10-NEXT: global_store_dwordx4 v70, v[60:63], s[2:3] offset:112 -; GFX10-NEXT: global_store_dwordx4 v70, v[28:31], s[2:3] offset:240 +; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id From 8fb269d94075c70b676ff8b7c8baf695cc66c74f Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 16 Aug 2021 15:27:55 -0400 Subject: [PATCH 200/700] [InstCombine] add tests for smin/smax intrinsics with negated ops; NFC --- .../InstCombine/minmax-intrinsics.ll | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index c21724b46308a..a3dbbc9dcbde4 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -1030,3 +1030,115 @@ define i8 @umin_demand_and_7_8(i8 %x) { %r = and i8 %m, -8 ret i8 %r } + +define i8 @neg_neg_nsw_smax(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_nsw_smax( +; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub nsw i8 0, %x + %ny = sub nsw i8 0, %y + %m = call i8 @llvm.smax.i8(i8 %nx, i8 %ny) + ret i8 %m +} + +define <3 x i8> @neg_neg_nsw_smin(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @neg_neg_nsw_smin( +; CHECK-NEXT: [[NX:%.*]] = sub nsw <3 x i8> zeroinitializer, [[X:%.*]] +; CHECK-NEXT: [[NY:%.*]] = sub nsw <3 x i8> zeroinitializer, [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[NX]], <3 x i8> [[NY]]) +; CHECK-NEXT: ret <3 x i8> [[M]] +; + %nx = sub nsw <3 x i8> zeroinitializer, %x + %ny = sub nsw <3 x i8> zeroinitializer, %y + %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %nx, <3 x i8> %ny) + ret <3 
x i8> %m +} + +define i8 @neg_neg_nsw_smax_use0(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_nsw_smax_use0( +; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] +; CHECK-NEXT: call void @use(i8 [[NX]]) +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub nsw i8 0, %x + call void @use(i8 %nx) + %ny = sub nsw i8 0, %y + %m = call i8 @llvm.smax.i8(i8 %nx, i8 %ny) + ret i8 %m +} + +define i8 @neg_neg_nsw_smin_use1(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_nsw_smin_use1( +; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use(i8 [[NY]]) +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub nsw i8 0, %x + %ny = sub nsw i8 0, %y + call void @use(i8 %ny) + %m = call i8 @llvm.smin.i8(i8 %nx, i8 %ny) + ret i8 %m +} + +define i8 @neg_neg_nsw_smin_use2(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_nsw_smin_use2( +; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] +; CHECK-NEXT: call void @use(i8 [[NX]]) +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use(i8 [[NY]]) +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub nsw i8 0, %x + call void @use(i8 %nx) + %ny = sub nsw i8 0, %y + call void @use(i8 %ny) + %m = call i8 @llvm.smin.i8(i8 %nx, i8 %ny) + ret i8 %m +} + +define i8 @neg_neg_smax(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_smax( +; CHECK-NEXT: [[NX:%.*]] = sub i8 0, [[X:%.*]] +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub i8 0, %x + %ny = sub nsw i8 0, %y + %m = call i8 @llvm.smax.i8(i8 %nx, i8 %ny) + ret i8 %m +} + +define i8 @neg_neg_smin(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_smin( +; CHECK-NEXT: [[NX:%.*]] = sub i8 0, 
[[X:%.*]] +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub i8 0, %x + %ny = sub nsw i8 0, %y + %m = call i8 @llvm.smin.i8(i8 %nx, i8 %ny) + ret i8 %m +} + +define i8 @neg_neg_nsw_umin(i8 %x, i8 %y) { +; CHECK-LABEL: @neg_neg_nsw_umin( +; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] +; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] +; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.umin.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: ret i8 [[M]] +; + %nx = sub nsw i8 0, %x + %ny = sub nsw i8 0, %y + %m = call i8 @llvm.umin.i8(i8 %nx, i8 %ny) + ret i8 %m +} From d0975b7cb0e184e8e5f3975183c51937dfa4043a Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 17 Aug 2021 07:31:05 -0400 Subject: [PATCH 201/700] [InstCombine] fold signed min/max intrinsics with negated operands If both operands are negated, we can invert the min/max and do the negation after: smax (neg nsw X), (neg nsw Y) --> neg nsw (smin X, Y) smin (neg nsw X), (neg nsw Y) --> neg nsw (smax X, Y) This is visible as a remaining regression in D98152. I don't see a way to generalize this for 'unsigned' or adapt Negator to handle it. 
This only appears to be safe with 'nsw': https://alive2.llvm.org/ce/z/GUy1zJ Differential Revision: https://reviews.llvm.org/D108165 --- .../InstCombine/InstCombineCalls.cpp | 11 ++++++++ .../InstCombine/minmax-intrinsics.ll | 26 ++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 25597840cad38..e3dd12c0c773a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1065,6 +1065,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } } + if (IID == Intrinsic::smax || IID == Intrinsic::smin) { + // smax (neg nsw X), (neg nsw Y) --> neg nsw (smin X, Y) + // smin (neg nsw X), (neg nsw Y) --> neg nsw (smax X, Y) + if (match(I0, m_NSWNeg(m_Value(X))) && match(I1, m_NSWNeg(m_Value(Y))) && + (I0->hasOneUse() || I1->hasOneUse())) { + Intrinsic::ID InvID = getInverseMinMaxIntrinsic(IID); + Value *InvMaxMin = Builder.CreateBinaryIntrinsic(InvID, X, Y); + return BinaryOperator::CreateNSWNeg(InvMaxMin); + } + } + if (match(I0, m_Not(m_Value(X)))) { // max (not X), (not Y) --> not (min X, Y) Intrinsic::ID InvID = getInverseMinMaxIntrinsic(IID); diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index a3dbbc9dcbde4..b46ddabffd95a 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -1033,9 +1033,8 @@ define i8 @umin_demand_and_7_8(i8 %x) { define i8 @neg_neg_nsw_smax(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_nsw_smax( -; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] -; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X:%.*]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M:%.*]] = sub nsw i8 0, 
[[TMP1]] ; CHECK-NEXT: ret i8 [[M]] ; %nx = sub nsw i8 0, %x @@ -1046,9 +1045,8 @@ define i8 @neg_neg_nsw_smax(i8 %x, i8 %y) { define <3 x i8> @neg_neg_nsw_smin(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @neg_neg_nsw_smin( -; CHECK-NEXT: [[NX:%.*]] = sub nsw <3 x i8> zeroinitializer, [[X:%.*]] -; CHECK-NEXT: [[NY:%.*]] = sub nsw <3 x i8> zeroinitializer, [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[NX]], <3 x i8> [[NY]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i8> @llvm.smax.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) +; CHECK-NEXT: [[M:%.*]] = sub nsw <3 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[M]] ; %nx = sub nsw <3 x i8> zeroinitializer, %x @@ -1061,8 +1059,8 @@ define i8 @neg_neg_nsw_smax_use0(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_nsw_smax_use0( ; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] ; CHECK-NEXT: call void @use(i8 [[NX]]) -; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] -; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smax.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 [[Y:%.*]]) +; CHECK-NEXT: [[M:%.*]] = sub nsw i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[M]] ; %nx = sub nsw i8 0, %x @@ -1074,10 +1072,10 @@ define i8 @neg_neg_nsw_smax_use0(i8 %x, i8 %y) { define i8 @neg_neg_nsw_smin_use1(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_nsw_smin_use1( -; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] ; CHECK-NEXT: [[NY:%.*]] = sub nsw i8 0, [[Y:%.*]] ; CHECK-NEXT: call void @use(i8 [[NY]]) -; CHECK-NEXT: [[M:%.*]] = call i8 @llvm.smin.i8(i8 [[NX]], i8 [[NY]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X:%.*]], i8 [[Y]]) +; CHECK-NEXT: [[M:%.*]] = sub nsw i8 0, [[TMP1]] ; CHECK-NEXT: ret i8 [[M]] ; %nx = sub nsw i8 0, %x @@ -1087,6 +1085,8 @@ define i8 @neg_neg_nsw_smin_use1(i8 %x, i8 %y) { ret i8 %m } +; negative test - too many uses + define i8 @neg_neg_nsw_smin_use2(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_nsw_smin_use2( ; CHECK-NEXT: [[NX:%.*]] = sub 
nsw i8 0, [[X:%.*]] @@ -1104,6 +1104,8 @@ define i8 @neg_neg_nsw_smin_use2(i8 %x, i8 %y) { ret i8 %m } +; negative test - need nsw on both ops + define i8 @neg_neg_smax(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_smax( ; CHECK-NEXT: [[NX:%.*]] = sub i8 0, [[X:%.*]] @@ -1117,6 +1119,8 @@ define i8 @neg_neg_smax(i8 %x, i8 %y) { ret i8 %m } +; negative test - need nsw on both ops + define i8 @neg_neg_smin(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_smin( ; CHECK-NEXT: [[NX:%.*]] = sub i8 0, [[X:%.*]] @@ -1130,6 +1134,8 @@ define i8 @neg_neg_smin(i8 %x, i8 %y) { ret i8 %m } +; negative test - need signed min/max + define i8 @neg_neg_nsw_umin(i8 %x, i8 %y) { ; CHECK-LABEL: @neg_neg_nsw_umin( ; CHECK-NEXT: [[NX:%.*]] = sub nsw i8 0, [[X:%.*]] From e73f4e11233323d66f1d47c4f8510f9bf812402d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 17 Aug 2021 08:10:16 -0400 Subject: [PATCH 202/700] [InstCombine] remove unused function argument; NFC --- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index e3dd12c0c773a..579a4bb77377c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -796,13 +796,12 @@ static Instruction *foldClampRangeOfTwo(IntrinsicInst *II, } /// Reduce a sequence of min/max intrinsics with a common operand. -static Instruction *factorizeMinMaxTree(IntrinsicInst *II, - InstCombiner::BuilderTy &Builder) { +static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { // Match 3 of the same min/max ops. Example: umin(umin(), umin()). 
auto *LHS = dyn_cast(II->getArgOperand(0)); auto *RHS = dyn_cast(II->getArgOperand(1)); Intrinsic::ID MinMaxID = II->getIntrinsicID(); - if (!LHS || !RHS || LHS->getIntrinsicID() != MinMaxID || + if (!LHS || !RHS || LHS->getIntrinsicID() != MinMaxID || RHS->getIntrinsicID() != MinMaxID || (!LHS->hasOneUse() && !RHS->hasOneUse())) return nullptr; @@ -1128,7 +1127,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *R = FoldOpIntoSelect(*II, Sel)) return R; - if (Instruction *NewMinMax = factorizeMinMaxTree(II, Builder)) + if (Instruction *NewMinMax = factorizeMinMaxTree(II)) return NewMinMax; break; From 62e892fa2d4f372fddc5e4ef5134830f8fa20062 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 17 Aug 2021 13:51:34 +0100 Subject: [PATCH 203/700] [ARM] Add MQQPR and MQQQQPR spill and reload pseudo instructions As a part of D107642, this adds pseudo instructions for MQQPR and MQQQQPR register classes, that can spill and reloads entire registers whilst keeping them combined, not splitting them into multiple D subregs that a VLDMIA/VSTMIA would use. This can help certain analyses, and helps to prevent verifier issues with subreg liveness. 
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 40 +++++++++++++++-- llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 45 ++++++++++++++++++++ llvm/lib/Target/ARM/ARMInstrMVE.td | 20 +++++++++ 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 643bc6d2d4e02..971eef1e9353e 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1254,6 +1254,11 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addReg(SrcReg, getKillRegState(isKill)) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); + } else if (Subtarget.hasMVEIntegerOps()) { + BuildMI(MBB, I, DebugLoc(), get(ARM::MQQPRStore)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addMemOperand(MMO); } else { MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA)) @@ -1269,8 +1274,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 64: - if (ARM::QQQQPRRegClass.hasSubClassEq(RC) || - ARM::MQQQQPRRegClass.hasSubClassEq(RC)) { + if (ARM::MQQQQPRRegClass.hasSubClassEq(RC) && + Subtarget.hasMVEIntegerOps()) { + BuildMI(MBB, I, DebugLoc(), get(ARM::MQQQQPRStore)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addMemOperand(MMO); + } else if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::VSTMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) @@ -1331,6 +1341,13 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return MI.getOperand(0).getReg(); } break; + case ARM::MQQPRStore: + case ARM::MQQQQPRStore: + if (MI.getOperand(1).isFI()) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); + } + break; } return 0; @@ -1486,6 +1503,10 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(16) 
.addMemOperand(MMO) .add(predOps(ARMCC::AL)); + } else if (Subtarget.hasMVEIntegerOps()) { + BuildMI(MBB, I, DL, get(ARM::MQQPRLoad), DestReg) + .addFrameIndex(FI) + .addMemOperand(MMO); } else { MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) .addFrameIndex(FI) @@ -1502,8 +1523,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 64: - if (ARM::QQQQPRRegClass.hasSubClassEq(RC) || - ARM::MQQQQPRRegClass.hasSubClassEq(RC)) { + if (ARM::MQQQQPRRegClass.hasSubClassEq(RC) && + Subtarget.hasMVEIntegerOps()) { + BuildMI(MBB, I, DL, get(ARM::MQQQQPRLoad), DestReg) + .addFrameIndex(FI) + .addMemOperand(MMO); + } else if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) { MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA)) .addFrameIndex(FI) .add(predOps(ARMCC::AL)) @@ -1572,6 +1597,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return MI.getOperand(0).getReg(); } break; + case ARM::MQQPRLoad: + case ARM::MQQQQPRLoad: + if (MI.getOperand(1).isFI()) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); + } + break; } return 0; diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e68a3aa8bf478..0e8360814ae26 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -69,6 +69,7 @@ namespace { void ExpandLaneOp(MachineBasicBlock::iterator &MBBI); void ExpandVTBL(MachineBasicBlock::iterator &MBBI, unsigned Opc, bool IsExt); + void ExpandMQQPRLoadStore(MachineBasicBlock::iterator &MBBI); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); void CMSEClearGPRegs(MachineBasicBlock &MBB, @@ -887,6 +888,43 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, LLVM_DEBUG(dbgs() << "To: "; MIB.getInstr()->dump();); } +void 
ARMExpandPseudo::ExpandMQQPRLoadStore(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + unsigned NewOpc = + MI.getOpcode() == ARM::MQQPRStore || MI.getOpcode() == ARM::MQQQQPRStore + ? ARM::VSTMDIA + : ARM::VLDMDIA; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc)); + + unsigned Flags = getKillRegState(MI.getOperand(0).isKill()) | + getDefRegState(MI.getOperand(0).isDef()); + Register SrcReg = MI.getOperand(0).getReg(); + + // Copy the destination register. + MIB.add(MI.getOperand(1)); + MIB.add(predOps(ARMCC::AL)); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_0), Flags); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_1), Flags); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_2), Flags); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_3), Flags); + if (MI.getOpcode() == ARM::MQQQQPRStore || + MI.getOpcode() == ARM::MQQQQPRLoad) { + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_4), Flags); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_5), Flags); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_6), Flags); + MIB.addReg(TRI->getSubReg(SrcReg, ARM::dsub_7), Flags); + } + + if (NewOpc == ARM::VSTMDIA) + MIB.addReg(SrcReg, RegState::Implicit); + + TransferImpOps(MI, MIB, MIB); + MIB.cloneMemRefs(MI); + MI.eraseFromParent(); +} + static bool IsAnAddressOperand(const MachineOperand &MO) { // This check is overly conservative. Unless we are certain that the machine // operand is not a symbol reference, we return that it is a symbol reference. 
@@ -2916,6 +2954,13 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true; + case ARM::MQQPRLoad: + case ARM::MQQPRStore: + case ARM::MQQQQPRLoad: + case ARM::MQQQQPRStore: + ExpandMQQPRLoadStore(MBBI); + return true; + case ARM::tCMP_SWAP_8: assert(STI->isThumb()); return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, ARM::tUXTB, diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 5577de05a6e35..0777532e58e04 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -6930,6 +6930,26 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { } +// Pseudo instructions for lowering MQQPR and MQQQQPR stack spills and reloads. +// They are equivalent to VLDMDIA/VSTMDIA with a single reg, as opposed to multiple +// dreg subregs. + +let Predicates = [HasMVEInt], AM = AddrMode4 in { +let mayStore = 1, hasSideEffects = 0 in { + def MQQPRStore : t2PseudoInst<(outs), (ins MQQPR:$val, GPRnopc:$ptr), + 4, NoItinerary, []>; + def MQQQQPRStore : t2PseudoInst<(outs), (ins MQQQQPR:$val, GPRnopc:$ptr), + 4, NoItinerary, []>; +} +let mayLoad = 1, hasSideEffects = 0 in { + def MQQPRLoad : t2PseudoInst<(outs MQQPR:$val), (ins GPRnopc:$ptr), + 4, NoItinerary, []>; + def MQQQQPRLoad : t2PseudoInst<(outs MQQQQPR:$val), (ins GPRnopc:$ptr), + 4, NoItinerary, []>; +} +} + + //===----------------------------------------------------------------------===// // Patterns //===----------------------------------------------------------------------===// From 52e0cf9d61618353d2745a51a16ae408edf0f49b Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 17 Aug 2021 14:10:33 +0100 Subject: [PATCH 204/700] [ARM] Enable subreg liveness This enables subreg liveness in the arm backend when MVE is present, which allows the register allocator to detect when subregister 
are alive/dead, compared to only acting on full registers. This can helps produce better code on MVE with the way MQPR registers are made up of SPR registers, but is especially helpful for MQQPR and MQQQQPR registers, where there are very few "registers" available and being able to split them up into subregs can help produce much better code. Differential Revision: https://reviews.llvm.org/D107642 --- llvm/lib/Target/ARM/ARMSubtarget.cpp | 8 +- .../Thumb2/LowOverheadLoops/fast-fp-loops.ll | 41 +- .../LowOverheadLoops/lsr-profitable-chain.ll | 2 +- .../Thumb2/LowOverheadLoops/mov-operand.ll | 8 +- .../LowOverheadLoops/mve-float-loops.ll | 40 +- .../tail-pred-intrinsic-round.ll | 10 +- .../Thumb2/LowOverheadLoops/unpredload.ll | 4 +- llvm/test/CodeGen/Thumb2/active_lane_mask.ll | 6 +- llvm/test/CodeGen/Thumb2/mve-be.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-ctlz.ll | 18 +- llvm/test/CodeGen/Thumb2/mve-ctpop.ll | 2 +- llvm/test/CodeGen/Thumb2/mve-cttz.ll | 18 +- llvm/test/CodeGen/Thumb2/mve-div-expand.ll | 76 +- .../CodeGen/Thumb2/mve-float16regloops.ll | 60 +- .../CodeGen/Thumb2/mve-float32regloops.ll | 136 +- llvm/test/CodeGen/Thumb2/mve-fmas.ll | 720 ++++---- llvm/test/CodeGen/Thumb2/mve-fmath.ll | 327 ++-- llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll | 84 +- llvm/test/CodeGen/Thumb2/mve-frint.ll | 252 ++- .../CodeGen/Thumb2/mve-gather-ind16-scaled.ll | 20 +- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll | 38 +- .../CodeGen/Thumb2/mve-intrinsics/vld24.ll | 16 - .../Thumb2/mve-laneinterleaving-cost.ll | 126 +- .../CodeGen/Thumb2/mve-laneinterleaving.ll | 61 +- llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll | 112 +- llvm/test/CodeGen/Thumb2/mve-masked-store.ll | 100 +- llvm/test/CodeGen/Thumb2/mve-minmax.ll | 50 +- llvm/test/CodeGen/Thumb2/mve-nofloat.ll | 18 +- llvm/test/CodeGen/Thumb2/mve-phireg.ll | 30 +- llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll | 166 +- .../test/CodeGen/Thumb2/mve-pred-build-var.ll | 6 +- llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll | 10 +- 
llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 22 +- .../Thumb2/mve-scatter-ind16-scaled.ll | 14 +- .../Thumb2/mve-scatter-ind16-unscaled.ll | 24 +- .../Thumb2/mve-scatter-ind32-unscaled.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll | 16 +- .../CodeGen/Thumb2/mve-sext-masked-load.ll | 21 +- llvm/test/CodeGen/Thumb2/mve-shuffle.ll | 364 ++-- llvm/test/CodeGen/Thumb2/mve-shuffleext.ll | 40 +- llvm/test/CodeGen/Thumb2/mve-shufflemov.ll | 12 +- llvm/test/CodeGen/Thumb2/mve-simple-arith.ll | 150 +- .../test/CodeGen/Thumb2/mve-soft-float-abi.ll | 58 +- llvm/test/CodeGen/Thumb2/mve-vabdus.ll | 44 +- llvm/test/CodeGen/Thumb2/mve-vcmpf.ll | 962 +++++----- llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll | 986 +++++------ llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll | 1568 ++++++++--------- llvm/test/CodeGen/Thumb2/mve-vcvt.ll | 144 +- llvm/test/CodeGen/Thumb2/mve-vcvt16.ll | 41 +- llvm/test/CodeGen/Thumb2/mve-vdup.ll | 1 - .../test/CodeGen/Thumb2/mve-vecreduce-fadd.ll | 202 +-- .../CodeGen/Thumb2/mve-vecreduce-fminmax.ll | 960 +++++----- .../test/CodeGen/Thumb2/mve-vecreduce-fmul.ll | 206 +-- .../CodeGen/Thumb2/mve-vecreduce-loops.ll | 16 +- llvm/test/CodeGen/Thumb2/mve-vhadd.ll | 128 +- llvm/test/CodeGen/Thumb2/mve-vld2-post.ll | 8 +- llvm/test/CodeGen/Thumb2/mve-vld2.ll | 108 +- llvm/test/CodeGen/Thumb2/mve-vld3.ll | 810 ++++----- llvm/test/CodeGen/Thumb2/mve-vld4-post.ll | 58 +- llvm/test/CodeGen/Thumb2/mve-vld4.ll | 501 +++--- llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll | 12 +- llvm/test/CodeGen/Thumb2/mve-vldst4.ll | 176 +- llvm/test/CodeGen/Thumb2/mve-vmovn.ll | 80 +- llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll | 76 +- llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll | 68 +- llvm/test/CodeGen/Thumb2/mve-vst2-post.ll | 12 +- llvm/test/CodeGen/Thumb2/mve-vst2.ll | 165 +- llvm/test/CodeGen/Thumb2/mve-vst3.ll | 1218 ++++++------- llvm/test/CodeGen/Thumb2/mve-vst4-post.ll | 20 +- llvm/test/CodeGen/Thumb2/mve-vst4.ll | 497 +++--- .../CodeGen/Thumb2/mve-zext-masked-load.ll | 4 +- 71 files 
changed, 5869 insertions(+), 6496 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 90f1b693fec60..2e5bbb66604dd 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -389,7 +389,13 @@ bool ARMSubtarget::enableMachineScheduler() const { return useMachineScheduler(); } -bool ARMSubtarget::enableSubRegLiveness() const { return EnableSubRegLiveness; } +bool ARMSubtarget::enableSubRegLiveness() const { + if (EnableSubRegLiveness.getNumOccurrences()) + return EnableSubRegLiveness; + // Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs + // and q subregs for qqqqpr regs. + return hasMVEIntegerOps(); +} // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index f101dd4fcec93..603f667d0c615 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -227,11 +227,9 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vadd.f32 q0, q0, r0 -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: vldr s0, .LCPI1_0 -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.5: @@ -280,7 +278,7 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-LABEL: fast_float_half_mac: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB2_20 
@@ -303,13 +301,13 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vmul.f16 q5, q6, q5 ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vcvtt.f32.f16 s27, s21 +; CHECK-NEXT: vcvtt.f32.f16 s23, s21 +; CHECK-NEXT: vcvtb.f32.f16 s22, s21 +; CHECK-NEXT: vcvtt.f32.f16 s21, s20 +; CHECK-NEXT: vcvtb.f32.f16 s20, s20 ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: vcvtb.f32.f16 s26, s21 ; CHECK-NEXT: adds r3, #4 -; CHECK-NEXT: vcvtt.f32.f16 s25, s20 -; CHECK-NEXT: vcvtb.f32.f16 s24, s20 -; CHECK-NEXT: vadd.f32 q5, q3, q6 +; CHECK-NEXT: vadd.f32 q5, q3, q5 ; CHECK-NEXT: subs.w lr, lr, #1 ; CHECK-NEXT: bne .LBB2_3 ; CHECK-NEXT: b .LBB2_19 @@ -349,8 +347,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-NEXT: bpl .LBB2_8 ; CHECK-NEXT: .LBB2_7: @ %cond.load12 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s24, [r0, #6] -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vldr.16 s22, [r0, #6] +; CHECK-NEXT: vins.f16 s21, s22 ; CHECK-NEXT: .LBB2_8: @ %else13 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 @@ -391,15 +389,15 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-NEXT: bpl .LBB2_5 ; CHECK-NEXT: .LBB2_13: @ %cond.load6 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s24, [r0, #2] -; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vldr.16 s22, [r0, #2] +; CHECK-NEXT: vins.f16 s20, s22 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_6 ; CHECK-NEXT: .LBB2_14: @ %cond.load9 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s24, s21 ; CHECK-NEXT: vldr.16 s21, [r0, #4] -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vmovx.f16 s22, s0 +; CHECK-NEXT: vins.f16 s21, s22 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bmi .LBB2_7 ; CHECK-NEXT: b .LBB2_8 @@ -410,21 +408,21 @@ define arm_aapcs_vfpcc float 
@fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-NEXT: bpl .LBB2_10 ; CHECK-NEXT: .LBB2_16: @ %cond.load19 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s28, [r1, #2] -; CHECK-NEXT: vins.f16 s24, s28 +; CHECK-NEXT: vldr.16 s26, [r1, #2] +; CHECK-NEXT: vins.f16 s24, s26 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bpl .LBB2_11 ; CHECK-NEXT: .LBB2_17: @ %cond.load22 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vmovx.f16 s28, s25 ; CHECK-NEXT: vldr.16 s25, [r1, #4] -; CHECK-NEXT: vins.f16 s25, s28 +; CHECK-NEXT: vmovx.f16 s26, s0 +; CHECK-NEXT: vins.f16 s25, s26 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bpl.w .LBB2_2 ; CHECK-NEXT: .LBB2_18: @ %cond.load25 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s28, [r1, #6] -; CHECK-NEXT: vins.f16 s25, s28 +; CHECK-NEXT: vldr.16 s26, [r1, #6] +; CHECK-NEXT: vins.f16 s25, s26 ; CHECK-NEXT: b .LBB2_2 ; CHECK-NEXT: .LBB2_19: @ %middle.block ; CHECK-NEXT: vdup.32 q0, r12 @@ -439,9 +437,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h ; CHECK-NEXT: .LBB2_20: ; CHECK-NEXT: vldr s0, .LCPI2_0 ; CHECK-NEXT: .LBB2_21: @ %for.cond.cleanup -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.22: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll index fddbfa8b66207..856e150e6012d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll @@ -14,8 +14,8 @@ define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) { ; CHECK-NEXT: vmvn.i32 q1, #0x1f ; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vadd.i32 q1, q3, q1 ; 
CHECK-NEXT: subs r3, r1, #1 +; CHECK-NEXT: vadd.i32 q1, q3, q1 ; CHECK-NEXT: vidup.u32 q2, r2, #8 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vadd.i32 q1, q2, r0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll index 8c23b09e650bf..9162d4a3f2142 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -15,10 +15,10 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit -; CHECK-NEXT: vmov s4, r1 -; CHECK-NEXT: vadd.f32 s0, s3, s3 -; CHECK-NEXT: vcvt.f32.u32 s4, s4 -; CHECK-NEXT: vdiv.f32 s0, s0, s4 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vadd.f32 s2, s3, s3 +; CHECK-NEXT: vcvt.f32.u32 s0, s0 +; CHECK-NEXT: vdiv.f32 s0, s2, s0 ; CHECK-NEXT: vmov r12, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: dlstp.32 lr, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index 2974db0d816b9..f1b3014e358a0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -986,11 +986,11 @@ define arm_aapcs_vfpcc void @half_half_mul(half* nocapture readonly %a, half* no ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vmul.f16 q0, q0, q1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB5_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1100,11 
+1100,11 @@ define arm_aapcs_vfpcc void @half_half_add(half* nocapture readonly %a, half* no ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1214,11 +1214,11 @@ define arm_aapcs_vfpcc void @half_half_sub(half* nocapture readonly %a, half* no ; CHECK-NEXT: vmov.32 q1[1], r10 ; CHECK-NEXT: adds r5, #8 ; CHECK-NEXT: vsub.f16 q0, q0, q1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 @@ -1333,11 +1333,11 @@ define arm_aapcs_vfpcc void @half_short_mul(half* nocapture readonly %a, i16* no ; CHECK-NEXT: vmov.16 q0[3], r8 ; CHECK-NEXT: vcvt.f16.s16 q0, q0 ; CHECK-NEXT: vmul.f16 q0, q1, q0 -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vstrb.8 q1, [r6], #16 +; CHECK-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-NEXT: vstrb.8 q0, [r6], #16 ; CHECK-NEXT: le lr, .LBB8_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldr r7, [sp] @ 4-byte Reload diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll index 024857b658023..b01a0cc047c29 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -240,11 +240,11 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vrintr.f32 s7, s3 -; CHECK-NEXT: vrintr.f32 s6, s2 -; CHECK-NEXT: vrintr.f32 s5, s1 -; CHECK-NEXT: vrintr.f32 s4, s0 -; CHECK-NEXT: vstrw.32 q1, [r1], #16 +; CHECK-NEXT: vrintr.f32 s3, s3 +; CHECK-NEXT: vrintr.f32 s2, s2 +; CHECK-NEXT: vrintr.f32 s1, s1 +; CHECK-NEXT: vrintr.f32 s0, s0 +; CHECK-NEXT: vstrw.32 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll index d5a44e41e77f9..29174b44cd45a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -11,9 +11,9 @@ define void @arm_cmplx_mag_squared_q15_mve(i16* %pSrc, i16* %pDst, i32 %blockSiz ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! 
-; CHECK-NEXT: vmulh.s16 q2, q1, q1 +; CHECK-NEXT: vmulh.s16 q1, q1, q1 ; CHECK-NEXT: vmulh.s16 q0, q0, q0 -; CHECK-NEXT: vqadd.s16 q0, q0, q2 +; CHECK-NEXT: vqadd.s16 q0, q0, q1 ; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: vstrh.16 q0, [r1], #16 ; CHECK-NEXT: letp lr, .LBB0_1 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll index 98f00707df37c..607a55b52370b 100644 --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -14,9 +14,9 @@ define <4 x i32> @v4i32(i32 %index, i32 %TC, <4 x i32> %V1, <4 x i32> %V2) { ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.u32 hi, q1, q0 -; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vldr d1, [sp] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 @@ -156,8 +156,8 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) { ; CHECK-NEXT: vpnot ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.i16 ne, q0, zr -; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vldr d1, [sp, #48] +; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 @@ -339,12 +339,12 @@ define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) { ; CHECK-NEXT: vmov.8 q3[14], r0 ; CHECK-NEXT: vmov.u16 r0, q0[7] ; CHECK-NEXT: vmov.8 q3[15], r0 -; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: add r0, sp, #88 ; CHECK-NEXT: vcmp.i8 ne, q3, zr ; CHECK-NEXT: vldr d1, [sp, #80] ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vpnot +; CHECK-NEXT: vmov d0, r2, r3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.i8 ne, q2, zr ; CHECK-NEXT: vpsel q0, q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll index d941036488ba6..534530d772418 100644 --- a/llvm/test/CodeGen/Thumb2/mve-be.ll +++ b/llvm/test/CodeGen/Thumb2/mve-be.ll @@ -70,10 +70,10 @@ entry: 
define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LE-LABEL: add_soft: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -81,9 +81,9 @@ define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) { ; ; CHECK-BE-LABEL: add_soft: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0] ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0 diff --git a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll index c44a9efe39573..eee41da87423d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-ctlz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ctlz.ll @@ -12,8 +12,10 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_0_t(<2 x i64> %src){ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s6, r0 +; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r2, ne @@ -21,10 +23,7 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_0_t(<2 x i64> %src){ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s4, r0 -; CHECK-NEXT: vldr s5, .LCPI0_0 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: @@ -76,8 +75,10 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_1_t(<2 x i64> %src){ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s6, r0 +; CHECK-NEXT: vmov s2, r0 ; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, 
.LCPI4_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: cset r2, ne @@ -85,10 +86,7 @@ define arm_aapcs_vfpcc <2 x i64> @ctlz_2i64_1_t(<2 x i64> %src){ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r0, r1 -; CHECK-NEXT: vmov s4, r0 -; CHECK-NEXT: vldr s5, .LCPI4_0 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll index f6f51068dd5f8..724bd4f7963b8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-ctpop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-ctpop.ll @@ -12,6 +12,7 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){ ; CHECK-NEXT: vmov r3, r4, d0 ; CHECK-NEXT: mov.w r12, #858993459 ; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: and.w r0, lr, r2, lsr #1 ; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: and.w r2, r12, r0, lsr #2 @@ -51,7 +52,6 @@ define arm_aapcs_vfpcc <2 x i64> @ctpop_2i64_t(<2 x i64> %src){ ; CHECK-NEXT: vmov s2, r1 ; CHECK-NEXT: add.w r0, r2, r0, lsr #24 ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-cttz.ll b/llvm/test/CodeGen/Thumb2/mve-cttz.ll index b844bc217e571..e5d4a93ee4f67 100644 --- a/llvm/test/CodeGen/Thumb2/mve-cttz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-cttz.ll @@ -4,8 +4,7 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){ ; CHECK-LABEL: cttz_2i64_0_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -16,7 +15,9 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s2, r1 -; 
CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI0_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -27,8 +28,6 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_0_t(<2 x i64> %src){ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s0, r1 -; CHECK-NEXT: vldr s1, .LCPI0_0 -; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: @@ -78,8 +77,7 @@ entry: define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){ ; CHECK-LABEL: cttz_2i64_1_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov r0, r1, d3 +; CHECK-NEXT: vmov r0, r1, d1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -90,7 +88,9 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s2, r1 -; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vldr s1, .LCPI4_0 +; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: rbit r1, r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: clz r1, r1 @@ -101,8 +101,6 @@ define arm_aapcs_vfpcc <2 x i64> @cttz_2i64_1_t(<2 x i64> %src){ ; CHECK-NEXT: it ne ; CHECK-NEXT: clzne r1, r0 ; CHECK-NEXT: vmov s0, r1 -; CHECK-NEXT: vldr s1, .LCPI4_0 -; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll index 3a746fc749feb..bb853f698cdfd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -724,11 +724,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @fdiv_f32(<4 x float> %in1, <4 x float> %in2) { ; CHECK-LABEL: fdiv_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdiv.f32 s11, s3, s7 -; CHECK-NEXT: vdiv.f32 s10, s2, s6 -; CHECK-NEXT: vdiv.f32 s9, s1, s5 -; CHECK-NEXT: vdiv.f32 s8, s0, s4 -; 
CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vdiv.f32 s3, s3, s7 +; CHECK-NEXT: vdiv.f32 s2, s2, s6 +; CHECK-NEXT: vdiv.f32 s1, s1, s5 +; CHECK-NEXT: vdiv.f32 s0, s0, s4 ; CHECK-NEXT: bx lr entry: %out = fdiv <4 x float> %in1, %in2 @@ -774,27 +773,26 @@ entry: define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK-LABEL: fdiv_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s14, s9 -; CHECK-NEXT: vdiv.f16 s12, s2, s0 -; CHECK-NEXT: vdiv.f16 s0, s8, s4 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vdiv.f16 s1, s9, s5 -; CHECK-NEXT: vins.f16 s1, s12 -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmovx.f16 s14, s10 -; CHECK-NEXT: vdiv.f16 s2, s10, s6 -; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vmovx.f16 s14, s11 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vdiv.f16 s12, s14, s12 -; CHECK-NEXT: vdiv.f16 s3, s11, s7 -; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vmovx.f16 s10, s0 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vdiv.f16 s0, s0, s4 +; CHECK-NEXT: vdiv.f16 s8, s10, s8 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vdiv.f16 s1, s1, s5 +; CHECK-NEXT: vdiv.f16 s4, s8, s4 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vdiv.f16 s2, s2, s6 +; CHECK-NEXT: vdiv.f16 s4, s8, s4 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s6, s3 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vdiv.f16 s3, s3, s7 +; CHECK-NEXT: vdiv.f16 s4, s6, s4 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %out = fdiv <8 x half> %in1, %in2 @@ -806,8 +804,8 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push 
{r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q5, q0 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vcvtb.f32.f16 s0, s20 @@ -816,59 +814,59 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) { ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vcvtt.f32.f16 s0, s20 -; CHECK-NEXT: vmov s24, r0 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s24, s24 -; CHECK-NEXT: vcvtt.f16.f32 s24, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s25, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s25, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s26, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s26, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; 
CHECK-NEXT: vcvtb.f16.f32 s27, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s27, s0 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %out = frem <8 x half> %in1, %in2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index b5c6c216affa3..4af4ec0b885cd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1422,22 +1422,22 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldrd r12, r6, [r0, #4] -; CHECK-NEXT: and r8, r3, #1 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vldr.16 s4, .LCPI17_0 +; CHECK-NEXT: and r8, r3, #1 +; CHECK-NEXT: vldr.16 s0, .LCPI17_0 ; CHECK-NEXT: lsr.w r9, r3, #1 -; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: b .LBB17_3 ; CHECK-NEXT: .LBB17_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 -; CHECK-NEXT: vstr.16 s8, [r12] -; CHECK-NEXT: vmovx.f16 s9, s8 +; CHECK-NEXT: vmovx.f16 s5, s4 +; CHECK-NEXT: vstr.16 s4, [r12] ; CHECK-NEXT: .LBB17_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 -; CHECK-NEXT: vstr.16 s9, [r12, #2] +; CHECK-NEXT: vstr.16 s5, [r12, #2] ; CHECK-NEXT: adds r6, #10 ; CHECK-NEXT: subs r0, #1 ; CHECK-NEXT: add.w r12, r12, #4 @@ -1446,15 +1446,15 @@ define void 
@arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: .LBB17_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB17_5 Depth 2 -; CHECK-NEXT: vldrh.u16 q3, [r6] +; CHECK-NEXT: vldrh.u16 q2, [r6] ; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vshlc q4, r5, #16 +; CHECK-NEXT: vldrh.u16 q3, [r6, #4] ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vshlc q5, r5, #16 -; CHECK-NEXT: vldrh.u16 q4, [r6, #4] -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vshlc q6, r5, #16 -; CHECK-NEXT: vldrh.u16 q2, [r12] -; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vldrh.u16 q1, [r12] +; CHECK-NEXT: vmov.f32 s5, s1 ; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: wls lr, r9, .LBB17_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader @@ -1464,19 +1464,19 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: ldrh r7, [r1], #4 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vfma.f16 q2, q3, r7 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vfma.f16 q1, q2, r7 ; CHECK-NEXT: ldrh r4, [r1, #-2] -; CHECK-NEXT: vmov.u16 r7, q2[0] -; CHECK-NEXT: vfma.f16 q2, q4, r7 -; CHECK-NEXT: vins.f16 s9, s4 -; CHECK-NEXT: vfma.f16 q2, q5, r4 -; CHECK-NEXT: vmov.u16 r4, q2[1] -; CHECK-NEXT: vfma.f16 q2, q6, r4 +; CHECK-NEXT: vmov.u16 r7, q1[0] +; CHECK-NEXT: vfma.f16 q1, q3, r7 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vfma.f16 q1, q4, r4 +; CHECK-NEXT: vmov.u16 r4, q1[1] +; CHECK-NEXT: vfma.f16 q1, q5, r4 ; CHECK-NEXT: strh r4, [r5, #2] -; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s4, s5 ; CHECK-NEXT: strh r7, [r5], #4 -; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov.16 q1[2], r3 ; CHECK-NEXT: le lr, .LBB17_5 ; CHECK-NEXT: .LBB17_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 @@ -1485,15 +1485,15 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: @ 
%bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vfma.f16 q2, q3, r1 -; CHECK-NEXT: vmov.u16 r1, q2[0] -; CHECK-NEXT: vfma.f16 q2, q4, r1 +; CHECK-NEXT: vfma.f16 q1, q2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vfma.f16 q1, q3, r1 ; CHECK-NEXT: strh r1, [r5] -; CHECK-NEXT: vmovx.f16 s6, s8 -; CHECK-NEXT: vstr.16 s6, [r12] +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vstr.16 s2, [r12] ; CHECK-NEXT: b .LBB17_2 ; CHECK-NEXT: .LBB17_8: @ %do.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} ; CHECK-NEXT: .p2align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 39ff830e7be63..58177a877338c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1416,8 +1416,8 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biqu ; CHECK-NEXT: @ Child Loop BB17_3 Depth 2 ; CHECK-NEXT: ldrd r5, r7, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vldr s8, [r0, #8] ; CHECK-NEXT: ldr r6, [r0, #12] +; CHECK-NEXT: vldr s8, [r0, #8] ; CHECK-NEXT: vstrw.32 q1, [r4] ; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: vldr s12, [r0, #16] @@ -1647,8 +1647,8 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: ldrd r12, r10, [r0] ; CHECK-NEXT: @ implicit-def: $s2 ; CHECK-NEXT: and r7, r3, #3 @@ -1656,19 +1656,19 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_ ; CHECK-NEXT: lsrs r0, r3, #2 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; 
CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: vmov.f32 s0, s10 +; CHECK-NEXT: vmov.f32 s4, s3 ; CHECK-NEXT: vmov.f32 s7, s6 ; CHECK-NEXT: .LBB19_2: @ %if.end69 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vstr s8, [r10] +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 -; CHECK-NEXT: vstr s0, [r10, #4] +; CHECK-NEXT: vstr s1, [r10] ; CHECK-NEXT: add.w r9, r9, #128 +; CHECK-NEXT: vstr s4, [r10, #4] ; CHECK-NEXT: vstr s14, [r10, #8] ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vstr s7, [r10, #12] @@ -1677,48 +1677,48 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_ ; CHECK-NEXT: .LBB19_3: @ %do.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: vldr s7, [r10, #8] -; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: vldr s8, [r10] -; CHECK-NEXT: vldr s10, [r10, #4] +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: vldr s1, [r10] +; CHECK-NEXT: vldr s3, [r10, #4] +; CHECK-NEXT: vldr s7, [r10, #8] ; CHECK-NEXT: vldr s6, [r10, #12] ; CHECK-NEXT: wls lr, r0, .LBB19_6 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: .LBB19_5: @ %while.body ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vldr s8, [r1, #12] -; CHECK-NEXT: vldrw.u32 q0, [r9, #112] -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vldr s10, [r1, #8] ; CHECK-NEXT: vmov r7, s7 +; CHECK-NEXT: vldrw.u32 q2, [r9, #16] ; CHECK-NEXT: vmov r11, s6 +; CHECK-NEXT: 
vldrw.u32 q1, [r9, #112] +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vldr s1, [r1, #12] +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vldr s3, [r1, #8] +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r9] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov r8, s8 -; CHECK-NEXT: vldrw.u32 q0, [r9, #16] +; CHECK-NEXT: vmov r8, s1 ; CHECK-NEXT: ldr r6, [r1, #4] ; CHECK-NEXT: vldrw.u32 q7, [r9, #32] ; CHECK-NEXT: vmul.f32 q1, q1, r8 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vldrw.u32 q3, [r9, #48] -; CHECK-NEXT: vfma.f32 q1, q0, r0 +; CHECK-NEXT: vfma.f32 q1, q2, r0 ; CHECK-NEXT: ldr r0, [r1], #16 ; CHECK-NEXT: vfma.f32 q1, q7, r6 ; CHECK-NEXT: vldrw.u32 q6, [r9, #64] +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vfma.f32 q1, q3, r0 ; CHECK-NEXT: vldrw.u32 q5, [r9, #80] ; CHECK-NEXT: vfma.f32 q1, q6, r4 ; CHECK-NEXT: vldrw.u32 q4, [r9, #96] +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q1, q5, r3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q1, q4, r7 -; CHECK-NEXT: vfma.f32 q1, q0, r11 -; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vfma.f32 q1, q2, r11 ; CHECK-NEXT: vstrb.8 q1, [r5], #16 ; CHECK-NEXT: le lr, .LBB19_5 ; CHECK-NEXT: .LBB19_6: @ %while.end @@ -1728,74 +1728,68 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_ ; CHECK-NEXT: beq .LBB19_1 ; CHECK-NEXT: @ %bb.7: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vldr s24, [r1] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vldr s0, [r1, #4] -; CHECK-NEXT: vldrw.u32 q3, [r9] -; CHECK-NEXT: vldr s3, [r1, #12] -; CHECK-NEXT: vldrw.u32 q4, [r9, #32] -; CHECK-NEXT: vldr s1, [r1, #8] -; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: vldrw.u32 q2, [r9, #96] -; CHECK-NEXT: vmov r6, s3 +; CHECK-NEXT: vmov lr, s6 +; CHECK-NEXT: vldr s6, [r1, #12] +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill +; 
CHECK-NEXT: vldrw.u32 q2, [r9, #112] +; CHECK-NEXT: vldr s1, [r1, #8] +; CHECK-NEXT: vldrw.u32 q3, [r9] +; CHECK-NEXT: vldr s4, [r1, #4] +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: vldrw.u32 q2, [r9, #16] +; CHECK-NEXT: vldr s0, [r1] ; CHECK-NEXT: vmul.f32 q3, q3, r6 ; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vstrw.32 q2, [sp, #24] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r9, #112] +; CHECK-NEXT: vldrw.u32 q4, [r9, #32] +; CHECK-NEXT: vfma.f32 q3, q2, r6 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: vldrw.u32 q5, [r9, #48] -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r9, #80] ; CHECK-NEXT: vldrw.u32 q7, [r9, #64] -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r9, #16] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: cmp r7, #1 -; CHECK-NEXT: vfma.f32 q3, q2, r6 -; CHECK-NEXT: vldrw.u32 q2, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vfma.f32 q3, q4, r4 -; CHECK-NEXT: vmov lr, s6 ; CHECK-NEXT: vfma.f32 q3, q5, r3 +; CHECK-NEXT: vldrw.u32 q6, [r9, #80] +; CHECK-NEXT: vmov r1, s3 ; CHECK-NEXT: vfma.f32 q3, q7, r0 -; CHECK-NEXT: vfma.f32 q3, q2, r1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #24] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vfma.f32 q3, q6, r1 +; CHECK-NEXT: cmp r7, #1 ; CHECK-NEXT: vfma.f32 q3, q2, r2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vfma.f32 q3, q2, lr ; CHECK-NEXT: bne .LBB19_9 ; CHECK-NEXT: @ %bb.8: @ %if.then58 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vstr s12, [r5] -; CHECK-NEXT: vmov.f32 s8, s24 -; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s1, s0 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s14, s12 -; CHECK-NEXT: b .LBB19_11 +; CHECK-NEXT: b .LBB19_12 ; CHECK-NEXT: 
.LBB19_9: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: vstmia r5, {s12, s13} -; CHECK-NEXT: bne .LBB19_12 +; CHECK-NEXT: bne .LBB19_11 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s0 ; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov.f32 s0, s24 ; CHECK-NEXT: vmov.f32 s7, s12 -; CHECK-NEXT: .LBB19_11: @ %if.end69 -; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: b .LBB19_2 -; CHECK-NEXT: .LBB19_12: @ %if.else64 +; CHECK-NEXT: b .LBB19_12 +; CHECK-NEXT: .LBB19_11: @ %if.else64 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 ; CHECK-NEXT: vmov.f32 s7, s13 -; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vstr s14, [r5, #8] -; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: .LBB19_12: @ %if.end69 +; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1 +; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: b .LBB19_2 ; CHECK-NEXT: .LBB19_13: @ %do.end -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -2026,8 +2020,8 @@ define void @arm_biquad_cascade_df2T_f32(%struct.arm_biquad_cascade_df2T_instanc ; CHECK-NEXT: b .LBB20_3 ; CHECK-NEXT: .LBB20_1: @ %if.else ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 -; CHECK-NEXT: vstr s4, [r12] ; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vstr s4, [r12] ; CHECK-NEXT: .LBB20_2: @ %if.end ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1 ; CHECK-NEXT: vstr s6, [r12, #4] @@ -2209,9 +2203,9 @@ do.end: ; preds = %if.end define arm_aapcs_vfpcc float @vecAddAcrossF32Mve(<4 x float> %in) { ; CHECK-LABEL: vecAddAcrossF32Mve: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s4, s0, s1 -; CHECK-NEXT: vadd.f32 s4, s4, s2 -; 
CHECK-NEXT: vadd.f32 s0, s4, s3 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NEXT: bx lr entry: %0 = extractelement <4 x float> %in, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll index a212158487286..b13a98666c728 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll @@ -22,25 +22,25 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1(<8 x half> %src1, <8 x half> %src2, ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vins.f16 s0, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vins.f16 s2, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vmla.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s3, s13 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s13 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x 
half> %src2, %src3 @@ -67,25 +67,25 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2(<8 x half> %src1, <8 x half> %src2, ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vins.f16 s0, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmla.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmla.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vins.f16 s2, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vmla.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s3, s13 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s13 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -112,25 +112,25 @@ define arm_aapcs_vfpcc <8 x half> @vfms16(<8 x half> %src1, <8 x half> %src2, <8 ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: vmls.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vins.f16 s0, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 +; CHECK-MVE-NEXT: 
vmovx.f16 s12, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmls.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmls.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vmls.f16 s12, s8, s4 ; CHECK-MVE-NEXT: vmls.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vins.f16 s2, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: vmls.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12 -; CHECK-MVE-NEXT: vins.f16 s3, s13 +; CHECK-MVE-NEXT: vmls.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s13 +; CHECK-MVE-NEXT: vins.f16 s2, s12 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -161,22 +161,22 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16(<8 x half> %src1, <8 x half> %src2, f ; CHECK-MVE-NEXT: vmovx.f16 s10, s4 ; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vmovx.f16 s10, s5 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s1 -; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 +; CHECK-MVE-NEXT: vmovx.f16 s10, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmla.f16 s1, s5, s8 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s2 -; CHECK-MVE-NEXT: vmovx.f16 s10, s6 +; CHECK-MVE-NEXT: vmla.f16 s10, s4, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s2 +; CHECK-MVE-NEXT: vmla.f16 s10, s4, s8 ; 
CHECK-MVE-NEXT: vmla.f16 s2, s6, s8 -; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 -; CHECK-MVE-NEXT: vmovx.f16 s10, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s3 -; CHECK-MVE-NEXT: vmla.f16 s12, s10, s8 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vmla.f16 s6, s4, s8 ; CHECK-MVE-NEXT: vmla.f16 s3, s7, s8 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s2, s10 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -207,33 +207,33 @@ define arm_aapcs_vfpcc <8 x half> @vfma16(<8 x half> %src1, <8 x half> %src2, fl ; CHECK-MVE: @ %bb.0: @ %entry ; CHECK-MVE-NEXT: vmov q3, q0 ; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s2, s12 ; CHECK-MVE-NEXT: vmov.f32 s8, s3 -; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vmla.f16 s8, s2, s0 ; CHECK-MVE-NEXT: vmov.f32 s0, s3 -; CHECK-MVE-NEXT: vmovx.f16 s9, s12 -; CHECK-MVE-NEXT: vmla.f16 s8, s9, s10 ; CHECK-MVE-NEXT: vmla.f16 s0, s12, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmov.f32 s9, s3 ; CHECK-MVE-NEXT: vmov.f32 s1, s3 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vmovx.f16 s10, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s2, s5 +; CHECK-MVE-NEXT: vmovx.f16 s4, s13 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 +; CHECK-MVE-NEXT: vmla.f16 s8, s4, s2 ; CHECK-MVE-NEXT: vmla.f16 s1, s13, s5 -; CHECK-MVE-NEXT: vmla.f16 s9, s10, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vins.f16 s1, s9 -; CHECK-MVE-NEXT: vmov.f32 s9, s3 +; CHECK-MVE-NEXT: vins.f16 s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s2, s6 +; CHECK-MVE-NEXT: vmovx.f16 s4, s14 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 +; CHECK-MVE-NEXT: vmla.f16 s8, s4, s2 ; CHECK-MVE-NEXT: vmov.f32 s2, s3 -; CHECK-MVE-NEXT: vmovx.f16 s10, s14 -; CHECK-MVE-NEXT: vmla.f16 s9, s10, s8 ; CHECK-MVE-NEXT: vmla.f16 s2, s14, 
s6 -; CHECK-MVE-NEXT: vins.f16 s2, s9 -; CHECK-MVE-NEXT: vmov.f32 s9, s3 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vmovx.f16 s10, s15 -; CHECK-MVE-NEXT: vmla.f16 s9, s10, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vins.f16 s2, s8 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: vmla.f16 s3, s15, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s9 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -364,14 +364,13 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32(<4 x float> %src1, <4 x float> %src2 ; ; CHECK-MVE-LABEL: vfmas32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: @ kill: def $s8 killed $s8 def $q2 ; CHECK-MVE-NEXT: vmov.f32 s11, s8 -; CHECK-MVE-NEXT: vmla.f32 s11, s3, s7 ; CHECK-MVE-NEXT: vmov.f32 s10, s8 -; CHECK-MVE-NEXT: vmla.f32 s10, s2, s6 ; CHECK-MVE-NEXT: vmov.f32 s9, s8 -; CHECK-MVE-NEXT: vmla.f32 s9, s1, s5 ; CHECK-MVE-NEXT: vmla.f32 s8, s0, s4 +; CHECK-MVE-NEXT: vmla.f32 s11, s3, s7 +; CHECK-MVE-NEXT: vmla.f32 s10, s2, s6 +; CHECK-MVE-NEXT: vmla.f32 s9, s1, s5 ; CHECK-MVE-NEXT: vmov q0, q2 ; CHECK-MVE-NEXT: bx lr entry: @@ -401,8 +400,6 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1_pred(<8 x half> %src1, <8 x half> % ; ; CHECK-MVE-LABEL: vfma16_v1_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s14, #0 @@ -417,98 +414,96 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1_pred(<8 x half> %src1, <8 x half> % ; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s15 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: it mi ; 
CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 +; CHECK-MVE-NEXT: vmla.f16 s14, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s0, s12 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s1 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s9 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s1 +; CHECK-MVE-NEXT: vmov.f32 s8, s1 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s5, s9 -; CHECK-MVE-NEXT: vseleq.f16 s13, s1, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s6 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 +; CHECK-MVE-NEXT: vmla.f16 s8, s5, s9 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s2 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 ; 
CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s10 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s8, s2 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s2 +; CHECK-MVE-NEXT: vmla.f16 s8, s6, s10 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s6, s10 -; CHECK-MVE-NEXT: vseleq.f16 s14, s2, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s7 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s6, #0 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s11 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 
s18, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmla.f16 s18, s7, s11 +; CHECK-MVE-NEXT: vmla.f16 s6, s7, s11 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s3, s18 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -534,8 +529,6 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2_pred(<8 x half> %src1, <8 x half> % ; ; CHECK-MVE-LABEL: vfma16_v2_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s14, #0 @@ -550,98 +543,96 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2_pred(<8 x half> %src1, <8 x half> % ; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s15 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 +; CHECK-MVE-NEXT: vmla.f16 s14, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s0, s12 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s1 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 ; 
CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s9 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s1 +; CHECK-MVE-NEXT: vmov.f32 s8, s1 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s5, s9 -; CHECK-MVE-NEXT: vseleq.f16 s13, s1, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s6 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 +; CHECK-MVE-NEXT: vmla.f16 s8, s5, s9 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s2 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s10 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s8, s2 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s2 +; CHECK-MVE-NEXT: 
vmla.f16 s8, s6, s10 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s18, s6, s10 -; CHECK-MVE-NEXT: vseleq.f16 s14, s2, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s7 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s6, #0 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s11 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmla.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmla.f16 s18, s7, s11 +; CHECK-MVE-NEXT: vmla.f16 s6, s7, s11 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s3, s18 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -667,8 +658,6 @@ define arm_aapcs_vfpcc <8 x half> @vfms16_pred(<8 x half> %src1, <8 x half> %src ; ; CHECK-MVE-LABEL: vfms16_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, 
d11} ; CHECK-MVE-NEXT: vmovx.f16 s14, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s14, #0 @@ -683,98 +672,96 @@ define arm_aapcs_vfpcc <8 x half> @vfms16_pred(<8 x half> %src1, <8 x half> %src ; CHECK-MVE-NEXT: vmls.f16 s15, s14, s12 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s12, s13, s15 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 +; CHECK-MVE-NEXT: vmls.f16 s14, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmls.f16 s12, s4, s8 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s0, s12 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s1 +; CHECK-MVE-NEXT: vins.f16 s0, s12 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s9 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmls.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmls.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s1 +; CHECK-MVE-NEXT: vmov.f32 s8, s1 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; 
CHECK-MVE-NEXT: vmls.f16 s18, s5, s9 -; CHECK-MVE-NEXT: vseleq.f16 s13, s1, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s6 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 +; CHECK-MVE-NEXT: vmls.f16 s8, s5, s9 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, #0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s2 +; CHECK-MVE-NEXT: vmovx.f16 s12, s2 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s10 -; CHECK-MVE-NEXT: vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmls.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 +; CHECK-MVE-NEXT: vmls.f16 s14, s8, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s8, s2 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s2 +; CHECK-MVE-NEXT: vmls.f16 s8, s6, s10 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmls.f16 s18, s6, s10 -; CHECK-MVE-NEXT: vseleq.f16 s14, s2, s18 -; CHECK-MVE-NEXT: vmovx.f16 s18, s7 -; CHECK-MVE-NEXT: vcmp.f16 s18, #0 -; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s6, #0 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s3 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s20, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s11 -; CHECK-MVE-NEXT: 
vmov.f32 s22, s20 -; CHECK-MVE-NEXT: vmls.f16 s22, s18, s16 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 +; CHECK-MVE-NEXT: vmls.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s20, s22 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s18, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmls.f16 s18, s7, s11 +; CHECK-MVE-NEXT: vmls.f16 s6, s7, s11 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s3, s18 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <8 x half> %src2, %src3 @@ -805,108 +792,107 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16_pred(<8 x half> %src1, <8 x half> %sr ; ; CHECK-MVE-LABEL: vfmar16_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvtb.f16.f32 s12, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s4 -; CHECK-MVE-NEXT: vcmp.f16 s8, #0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s10, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s10, s0 -; CHECK-MVE-NEXT: vmov.f32 s14, s10 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-MVE-NEXT: vmov.f32 s14, s12 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s14, s8, s12 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmla.f16 s14, s10, s8 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s14 +; CHECK-MVE-NEXT: vseleq.f16 s10, s12, s14 ; CHECK-MVE-NEXT: 
vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s8, s0 +; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s8, s4, s12 -; CHECK-MVE-NEXT: vseleq.f16 s8, s0, s8 -; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s8, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vins.f16 s0, s10 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s13 +; CHECK-MVE-NEXT: vmovx.f16 s10, s1 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 +; CHECK-MVE-NEXT: vmov.f32 s12, s10 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s1 +; CHECK-MVE-NEXT: vmov.f32 s10, s1 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s13, s5, s12 +; CHECK-MVE-NEXT: vmla.f16 s10, s5, s8 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s9, s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vins.f16 s9, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s13 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s2 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: 
vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s12, s4, s8 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s2 +; CHECK-MVE-NEXT: vmov.f32 s10, s2 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s13, s6, s12 -; CHECK-MVE-NEXT: vseleq.f16 s10, s2, s13 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s10 ; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s10, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s13 +; CHECK-MVE-NEXT: vmov.f32 s10, s6 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s14, s12 +; CHECK-MVE-NEXT: vmla.f16 s10, s4, s8 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 +; CHECK-MVE-NEXT: vmov.f32 s6, s3 ; CHECK-MVE-NEXT: cset r0, ne -; CHECK-MVE-NEXT: vmla.f16 s13, s7, s12 
+; CHECK-MVE-NEXT: vmla.f16 s6, s7, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s11, s3, s13 -; CHECK-MVE-NEXT: vins.f16 s11, s14 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s6 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -942,104 +928,103 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_pred(<8 x half> %src1, <8 x half> %src ; CHECK-MVE-NEXT: vmovx.f16 s10, s4 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s10, #0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s12, s8 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s14, s0 -; CHECK-MVE-NEXT: vmov.f32 s8, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: vmov.f32 s14, s8 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s8, s14, s10 +; CHECK-MVE-NEXT: vmla.f16 s14, s12, s10 ; CHECK-MVE-NEXT: vcmp.f16 s4, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s14, s8 +; CHECK-MVE-NEXT: vseleq.f16 s10, s12, s14 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s8, s12 +; CHECK-MVE-NEXT: vmla.f16 s12, s0, s4 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s8, s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s8, s0, s8 -; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s8, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vseleq.f16 s0, s0, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vins.f16 s0, s10 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, 
#1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s12 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s13, s14 +; CHECK-MVE-NEXT: vmovx.f16 s10, s1 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmla.f16 s12, s10, s4 ; CHECK-MVE-NEXT: vcmp.f16 s5, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s12 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s13, s1, s5 +; CHECK-MVE-NEXT: vmla.f16 s10, s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s9, s1, s13 -; CHECK-MVE-NEXT: vmovx.f16 s13, s2 -; CHECK-MVE-NEXT: vins.f16 s9, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s12 +; CHECK-MVE-NEXT: vseleq.f16 s1, s1, s10 +; CHECK-MVE-NEXT: vmovx.f16 s10, s2 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s15, s13, s14 +; CHECK-MVE-NEXT: vmla.f16 s12, s10, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s6, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s10, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s12 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmla.f16 s10, s2, s6 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmla.f16 s13, s2, 
s6 -; CHECK-MVE-NEXT: vseleq.f16 s10, s2, s13 +; CHECK-MVE-NEXT: vseleq.f16 s2, s2, s10 ; CHECK-MVE-NEXT: movs r1, #0 -; CHECK-MVE-NEXT: vins.f16 s10, s14 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vcmp.f16 s14, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s12 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s13, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmla.f16 s15, s13, s14 +; CHECK-MVE-NEXT: vmla.f16 s10, s6, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s7, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s13, s15 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s10 ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f16 s12, s3, s7 +; CHECK-MVE-NEXT: vmla.f16 s8, s3, s7 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s11, s3, s12 -; CHECK-MVE-NEXT: vins.f16 s11, s14 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vseleq.f16 s3, s3, s8 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr entry: %src3 = fptrunc float %src3o to half @@ -1068,51 +1053,50 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v1_pred(<4 x float> %src1, <4 x float ; ; CHECK-MVE-LABEL: vfma32_v1_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f32 s4, #0 -; CHECK-MVE-NEXT: 
vmov.f32 s13, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s14, s0 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s2 +; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s5, s2 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 -; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 ; CHECK-MVE-NEXT: vcmp.f32 s6, #0 ; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f32 s13, s7, s11 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmla.f32 s8, s7, s11 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 -; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 -; CHECK-MVE-NEXT: vmla.f32 s15, s6, s10 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s13 +; CHECK-MVE-NEXT: vmla.f32 s5, s6, s10 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s15 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s12 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s12 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s14 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s14 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <4 x float> %src2, %src3 @@ -1138,51 +1122,50 @@ define arm_aapcs_vfpcc <4 x float> @vfma32_v2_pred(<4 x float> %src1, <4 x float ; ; CHECK-MVE-LABEL: vfma32_v2_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; 
CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f32 s4, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s14, s0 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s2 +; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s5, s2 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 -; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 ; CHECK-MVE-NEXT: vcmp.f32 s6, #0 ; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f32 s13, s7, s11 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmla.f32 s8, s7, s11 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s12, s5, s9 -; CHECK-MVE-NEXT: vmla.f32 s14, s4, s8 -; CHECK-MVE-NEXT: vmla.f32 s15, s6, s10 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s13 +; CHECK-MVE-NEXT: vmla.f32 s5, s6, s10 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s8 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s15 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s12 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s12 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s14 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: 
vseleq.f32 s0, s0, s14 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <4 x float> %src2, %src3 @@ -1208,51 +1191,50 @@ define arm_aapcs_vfpcc <4 x float> @vfms32_pred(<4 x float> %src1, <4 x float> % ; ; CHECK-MVE-LABEL: vfms32_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s14, s0 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f32 s4, #0 -; CHECK-MVE-NEXT: vmov.f32 s13, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s12, s1 +; CHECK-MVE-NEXT: vmls.f32 s14, s4, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s14, s0 +; CHECK-MVE-NEXT: vmov.f32 s8, s3 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s15, s2 +; CHECK-MVE-NEXT: vmls.f32 s12, s5, s9 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s5, s2 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 -; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 ; CHECK-MVE-NEXT: vcmp.f32 s6, #0 ; CHECK-MVE-NEXT: cset r3, ne -; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmls.f32 s13, s7, s11 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmls.f32 s8, s7, s11 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmls.f32 s12, s5, s9 -; CHECK-MVE-NEXT: vmls.f32 s14, s4, s8 -; CHECK-MVE-NEXT: vmls.f32 s15, s6, s10 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s13 +; CHECK-MVE-NEXT: vmls.f32 s5, s6, s10 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s8 ; 
CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s15 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s12 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s12 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s14 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s14 ; CHECK-MVE-NEXT: bx lr entry: %0 = fmul <4 x float> %src2, %src3 @@ -1281,8 +1263,10 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float> ; ; CHECK-MVE-LABEL: vfmar32_pred: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: vmov.f32 s10, s1 ; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s5, #0 +; CHECK-MVE-NEXT: movs r2, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -1291,17 +1275,16 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float> ; CHECK-MVE-NEXT: vmov.f32 s14, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s10, s1 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s12, s0 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s9, s2 +; CHECK-MVE-NEXT: vmla.f32 s10, s5, s8 +; CHECK-MVE-NEXT: vmov.f32 s5, s2 ; CHECK-MVE-NEXT: cset r2, ne ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 ; CHECK-MVE-NEXT: movs r3, #0 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov.f32 s12, s0 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 @@ -1312,20 +1295,18 @@ define arm_aapcs_vfpcc <4 x float> @vfmar32_pred(<4 x float> %src1, <4 x float> ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vmla.f32 s14, s7, s8 ; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmla.f32 s14, s7, s8 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s10, s5, s8 -; 
CHECK-MVE-NEXT: vmla.f32 s12, s4, s8 -; CHECK-MVE-NEXT: vmla.f32 s9, s6, s8 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s14 +; CHECK-MVE-NEXT: vmla.f32 s5, s6, s8 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s14 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s9 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s10 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s10 +; CHECK-MVE-NEXT: vmla.f32 s12, s4, s8 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s12 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s12 ; CHECK-MVE-NEXT: bx lr entry: %i = insertelement <4 x float> undef, float %src3, i32 0 @@ -1366,15 +1347,15 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32_pred(<4 x float> %src1, <4 x float> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r2, #0 -; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r2, #1 -; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: cmp r2, #0 ; CHECK-MVE-NEXT: vcmp.f32 s7, #0 ; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov.f32 s10, s8 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vmov.f32 s12, s8 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r3, #1 ; CHECK-MVE-NEXT: cmp r3, #0 @@ -1388,17 +1369,16 @@ define arm_aapcs_vfpcc <4 x float> @vfmas32_pred(<4 x float> %src1, <4 x float> ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: vmla.f32 s14, s3, s7 ; CHECK-MVE-NEXT: cmp r3, #0 -; CHECK-MVE-NEXT: vmla.f32 s10, s1, s5 -; CHECK-MVE-NEXT: vmla.f32 s12, s0, s4 ; CHECK-MVE-NEXT: vmla.f32 s8, s2, s6 -; CHECK-MVE-NEXT: vseleq.f32 s7, s3, s14 +; CHECK-MVE-NEXT: vseleq.f32 s3, s3, s14 ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f32 s6, s2, s8 +; CHECK-MVE-NEXT: vseleq.f32 s2, s2, s8 +; CHECK-MVE-NEXT: vmla.f32 s10, s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f32 s5, s1, s10 +; CHECK-MVE-NEXT: 
vmla.f32 s12, s0, s4 +; CHECK-MVE-NEXT: vseleq.f32 s1, s1, s10 ; CHECK-MVE-NEXT: cmp r2, #0 -; CHECK-MVE-NEXT: vseleq.f32 s4, s0, s12 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vseleq.f32 s0, s0, s12 ; CHECK-MVE-NEXT: bx lr entry: %i = insertelement <4 x float> undef, float %src3, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index d75025c012072..cfda5a737e886 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -5,11 +5,10 @@ define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vsqrt.f32 s7, s3 -; CHECK-NEXT: vsqrt.f32 s6, s2 -; CHECK-NEXT: vsqrt.f32 s5, s1 -; CHECK-NEXT: vsqrt.f32 s4, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vsqrt.f32 s3, s3 +; CHECK-NEXT: vsqrt.f32 s2, s2 +; CHECK-NEXT: vsqrt.f32 s1, s1 +; CHECK-NEXT: vsqrt.f32 s0, s0 ; CHECK-NEXT: bx lr entry: %0 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %src) @@ -19,23 +18,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @sqrt_float16_t(<8 x half> %src) { ; CHECK-LABEL: sqrt_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vsqrt.f16 s8, s0 -; CHECK-NEXT: vsqrt.f16 s0, s4 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vsqrt.f16 s8, s8 -; CHECK-NEXT: vsqrt.f16 s1, s5 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vsqrt.f16 s8, s8 -; CHECK-NEXT: vsqrt.f16 s2, s6 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vsqrt.f16 s8, s8 -; CHECK-NEXT: vsqrt.f16 s3, s7 -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vsqrt.f16 s0, s0 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vsqrt.f16 s1, s1 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: 
vmovx.f16 s4, s2 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vsqrt.f16 s2, s2 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vsqrt.f16 s4, s4 +; CHECK-NEXT: vsqrt.f16 s3, s3 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: %0 = call fast <8 x half> @llvm.sqrt.v8f16(<8 x half> %src) @@ -101,52 +99,52 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: 
vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.cos.v8f16(<8 x half> %src) @@ -212,52 +210,52 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl 
sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.sin.v8f16(<8 x half> %src) @@ -323,52 +321,52 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, 
s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl expf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.exp.v8f16(<8 x half> %src) @@ -434,52 +432,52 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; 
CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.exp2.v8f16(<8 x half> %src) @@ -545,52 +543,52 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 
; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl logf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log.v8f16(<8 x half> %src) @@ -656,52 +654,52 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; 
CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log2f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log2.v8f16(<8 x half> %src) @@ -767,52 +765,52 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s16 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 -; CHECK-NEXT: vmov s20, r0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s20, s20 -; CHECK-NEXT: vcvtt.f16.f32 s20, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s21, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s21, s0 +; CHECK-NEXT: 
vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s22, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s22, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s23, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bl log10f ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s23, s0 -; CHECK-NEXT: vmov q0, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.log10.v8f16(<8 x half> %src) @@ -883,8 +881,8 @@ define arm_aapcs_vfpcc <8 x half> @pow_float16_t(<8 x half> %src1, <8 x half> %s ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmov q5, q0 ; CHECK-NEXT: vmov q4, q1 ; CHECK-NEXT: vcvtb.f32.f16 s0, s20 @@ -893,59 +891,59 @@ define arm_aapcs_vfpcc <8 x half> @pow_float16_t(<8 x half> %src1, <8 x half> %s ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vcvtt.f32.f16 s0, s20 -; CHECK-NEXT: vmov s24, r0 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s16 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov s16, r0 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s24, s24 -; CHECK-NEXT: vcvtt.f16.f32 s24, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s16 +; CHECK-NEXT: vcvtt.f16.f32 s16, s0 ; 
CHECK-NEXT: vcvtb.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s25, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s17 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s25, s0 +; CHECK-NEXT: vcvtt.f16.f32 s17, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s26, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s22 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s18 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s26, s0 +; CHECK-NEXT: vcvtt.f16.f32 s18, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtb.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtb.f16.f32 s27, s0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s23 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vcvtt.f32.f16 s0, s19 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vcvtt.f16.f32 s27, s0 -; CHECK-NEXT: vmov q0, q6 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vcvtt.f16.f32 s19, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %0 = call fast <8 x half> @llvm.pow.v8f16(<8 x half> %src1, <8 x half> %src2) @@ -996,8 +994,8 @@ define arm_aapcs_vfpcc <4 x float> @copysign_float32_t(<4 x float> %src1, <4 x f ; CHECK-NEXT: bfi r5, r1, #31, #1 ; CHECK-NEXT: lsr.w r1, r12, #31 ; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: vmov s3, r5 ; CHECK-NEXT: vmov s2, r4 +; CHECK-NEXT: vmov 
s3, r5 ; CHECK-NEXT: vmov s1, r0 ; CHECK-NEXT: vmov s0, r3 ; CHECK-NEXT: pop {r4, r5, r7, pc} @@ -1013,81 +1011,80 @@ define arm_aapcs_vfpcc <8 x half> @copysign_float16_t(<8 x half> %src1, <8 x hal ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmovx.f16 s8, s4 ; CHECK-NEXT: vstr.16 s8, [sp, #24] -; CHECK-NEXT: vmovx.f16 s8, s5 ; CHECK-NEXT: vstr.16 s4, [sp, #28] -; CHECK-NEXT: vstr.16 s8, [sp, #16] -; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vstr.16 s4, [sp, #16] +; CHECK-NEXT: vmovx.f16 s4, s6 ; CHECK-NEXT: vstr.16 s5, [sp, #20] -; CHECK-NEXT: vstr.16 s8, [sp, #8] -; CHECK-NEXT: vmovx.f16 s8, s7 +; CHECK-NEXT: vstr.16 s4, [sp, #8] +; CHECK-NEXT: vmovx.f16 s4, s7 ; CHECK-NEXT: vstr.16 s6, [sp, #12] -; CHECK-NEXT: vstr.16 s8, [sp] +; CHECK-NEXT: vstr.16 s4, [sp] ; CHECK-NEXT: vstr.16 s7, [sp, #4] -; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: ldrb.w r0, [sp, #25] +; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vabs.f16 s4, s4 -; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vabs.f16 s0, s0 ; CHECK-NEXT: tst.w r0, #128 +; CHECK-NEXT: vneg.f16 s6, s4 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #29] -; CHECK-NEXT: vseleq.f16 s8, s4, s6 -; CHECK-NEXT: vabs.f16 s4, s0 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vneg.f16 s6, s0 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vneg.f16 s6, s4 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s0, s3 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #17] -; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vseleq.f16 s0, s0, s6 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s8, s8 +; CHECK-NEXT: vabs.f16 s4, s4 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #21] -; CHECK-NEXT: vneg.f16 s10, s8 -; CHECK-NEXT: vseleq.f16 s8, s8, s10 -; CHECK-NEXT: vabs.f16 s10, s1 +; CHECK-NEXT: vneg.f16 s6, s4 +; 
CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vabs.f16 s6, s1 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vneg.f16 s12, s10 +; CHECK-NEXT: vneg.f16 s8, s6 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #9] -; CHECK-NEXT: vseleq.f16 s5, s10, s12 +; CHECK-NEXT: vseleq.f16 s1, s6, s8 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s8, s8 +; CHECK-NEXT: vabs.f16 s4, s4 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #13] -; CHECK-NEXT: vneg.f16 s10, s8 -; CHECK-NEXT: vseleq.f16 s8, s8, s10 -; CHECK-NEXT: vabs.f16 s10, s2 +; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vabs.f16 s2, s2 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vneg.f16 s12, s10 +; CHECK-NEXT: vneg.f16 s6, s2 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vneg.f16 s2, s0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #1] -; CHECK-NEXT: vseleq.f16 s6, s10, s12 +; CHECK-NEXT: vseleq.f16 s2, s2, s6 ; CHECK-NEXT: tst.w r0, #128 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: cset r0, ne -; CHECK-NEXT: vabs.f16 s8, s8 +; CHECK-NEXT: vabs.f16 s4, s4 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldrb.w r0, [sp, #5] -; CHECK-NEXT: vneg.f16 s10, s8 -; CHECK-NEXT: vseleq.f16 s8, s8, s10 +; CHECK-NEXT: vneg.f16 s6, s4 +; CHECK-NEXT: vseleq.f16 s4, s4, s6 +; CHECK-NEXT: vabs.f16 s6, s3 ; CHECK-NEXT: tst.w r0, #128 +; CHECK-NEXT: vneg.f16 s8, s6 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: vseleq.f16 s7, s0, s2 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vseleq.f16 s3, s6, s8 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll 
b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll index 1ec651ac06de8..0e993f35ce85d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll @@ -5,23 +5,22 @@ define arm_aapcs_vfpcc <8 x half> @fneg_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fneg_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vneg.f16 s8, s0 -; CHECK-MVE-NEXT: vneg.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vneg.f16 s8, s8 -; CHECK-MVE-NEXT: vneg.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vneg.f16 s8, s8 -; CHECK-MVE-NEXT: vneg.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vneg.f16 s8, s8 -; CHECK-MVE-NEXT: vneg.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vneg.f16 s0, s0 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vneg.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vneg.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vneg.f16 s4, s4 +; CHECK-MVE-NEXT: vneg.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fneg_float16_t: @@ -36,11 +35,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @fneg_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fneg_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vneg.f32 s7, s3 -; CHECK-MVE-NEXT: vneg.f32 s6, s2 -; CHECK-MVE-NEXT: vneg.f32 s5, s1 -; CHECK-MVE-NEXT: vneg.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vneg.f32 s3, s3 +; CHECK-MVE-NEXT: vneg.f32 s2, s2 +; CHECK-MVE-NEXT: vneg.f32 s1, s1 +; 
CHECK-MVE-NEXT: vneg.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fneg_float32_t: @@ -77,23 +75,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @fabs_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fabs_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vabs.f16 s8, s0 -; CHECK-MVE-NEXT: vabs.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vabs.f16 s8, s8 -; CHECK-MVE-NEXT: vabs.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vabs.f16 s8, s8 -; CHECK-MVE-NEXT: vabs.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vabs.f16 s8, s8 -; CHECK-MVE-NEXT: vabs.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vabs.f16 s0, s0 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vabs.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vabs.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vabs.f16 s4, s4 +; CHECK-MVE-NEXT: vabs.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fabs_float16_t: @@ -108,11 +105,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @fabs_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fabs_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vabs.f32 s7, s3 -; CHECK-MVE-NEXT: vabs.f32 s6, s2 -; CHECK-MVE-NEXT: vabs.f32 s5, s1 -; CHECK-MVE-NEXT: vabs.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vabs.f32 s3, s3 +; CHECK-MVE-NEXT: vabs.f32 s2, s2 +; CHECK-MVE-NEXT: vabs.f32 s1, s1 +; CHECK-MVE-NEXT: vabs.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; 
CHECK-MVEFP-LABEL: fabs_float32_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-frint.ll b/llvm/test/CodeGen/Thumb2/mve-frint.ll index 8cfc1418d4d9b..1d7dcc8bf8440 100644 --- a/llvm/test/CodeGen/Thumb2/mve-frint.ll +++ b/llvm/test/CodeGen/Thumb2/mve-frint.ll @@ -5,11 +5,10 @@ define arm_aapcs_vfpcc <4 x float> @fceil_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fceil_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintp.f32 s7, s3 -; CHECK-MVE-NEXT: vrintp.f32 s6, s2 -; CHECK-MVE-NEXT: vrintp.f32 s5, s1 -; CHECK-MVE-NEXT: vrintp.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintp.f32 s3, s3 +; CHECK-MVE-NEXT: vrintp.f32 s2, s2 +; CHECK-MVE-NEXT: vrintp.f32 s1, s1 +; CHECK-MVE-NEXT: vrintp.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fceil_float32_t: @@ -24,23 +23,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @fceil_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fceil_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintp.f16 s8, s0 -; CHECK-MVE-NEXT: vrintp.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintp.f16 s8, s8 -; CHECK-MVE-NEXT: vrintp.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintp.f16 s8, s8 -; CHECK-MVE-NEXT: vrintp.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintp.f16 s8, s8 -; CHECK-MVE-NEXT: vrintp.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintp.f16 s0, s0 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vrintp.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vrintp.f16 s2, s2 +; CHECK-MVE-NEXT: 
vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintp.f16 s4, s4 +; CHECK-MVE-NEXT: vrintp.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fceil_float16_t: @@ -79,11 +77,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @ftrunc_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: ftrunc_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintz.f32 s7, s3 -; CHECK-MVE-NEXT: vrintz.f32 s6, s2 -; CHECK-MVE-NEXT: vrintz.f32 s5, s1 -; CHECK-MVE-NEXT: vrintz.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintz.f32 s3, s3 +; CHECK-MVE-NEXT: vrintz.f32 s2, s2 +; CHECK-MVE-NEXT: vrintz.f32 s1, s1 +; CHECK-MVE-NEXT: vrintz.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ftrunc_float32_t: @@ -98,23 +95,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @ftrunc_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: ftrunc_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintz.f16 s8, s0 -; CHECK-MVE-NEXT: vrintz.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintz.f16 s8, s8 -; CHECK-MVE-NEXT: vrintz.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintz.f16 s8, s8 -; CHECK-MVE-NEXT: vrintz.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintz.f16 s8, s8 -; CHECK-MVE-NEXT: vrintz.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintz.f16 s0, s0 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vrintz.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vrintz.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; 
CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintz.f16 s4, s4 +; CHECK-MVE-NEXT: vrintz.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ftrunc_float16_t: @@ -153,11 +149,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @frint_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: frint_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintx.f32 s7, s3 -; CHECK-MVE-NEXT: vrintx.f32 s6, s2 -; CHECK-MVE-NEXT: vrintx.f32 s5, s1 -; CHECK-MVE-NEXT: vrintx.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintx.f32 s3, s3 +; CHECK-MVE-NEXT: vrintx.f32 s2, s2 +; CHECK-MVE-NEXT: vrintx.f32 s1, s1 +; CHECK-MVE-NEXT: vrintx.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: frint_float32_t: @@ -172,23 +167,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @frint_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: frint_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintx.f16 s8, s0 -; CHECK-MVE-NEXT: vrintx.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintx.f16 s8, s8 -; CHECK-MVE-NEXT: vrintx.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintx.f16 s8, s8 -; CHECK-MVE-NEXT: vrintx.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintx.f16 s8, s8 -; CHECK-MVE-NEXT: vrintx.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintx.f16 s0, s0 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vrintx.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vrintx.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: 
vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintx.f16 s4, s4 +; CHECK-MVE-NEXT: vrintx.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: frint_float16_t: @@ -227,11 +221,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @fnearbyint_float32_t(<4 x float> %src) { ; CHECK-LABEL: fnearbyint_float32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrintr.f32 s7, s3 -; CHECK-NEXT: vrintr.f32 s6, s2 -; CHECK-NEXT: vrintr.f32 s5, s1 -; CHECK-NEXT: vrintr.f32 s4, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vrintr.f32 s3, s3 +; CHECK-NEXT: vrintr.f32 s2, s2 +; CHECK-NEXT: vrintr.f32 s1, s1 +; CHECK-NEXT: vrintr.f32 s0, s0 ; CHECK-NEXT: bx lr entry: %0 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %src) @@ -241,23 +234,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @fnearbyint_float16_t(<8 x half> %src) { ; CHECK-LABEL: fnearbyint_float16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vrintr.f16 s8, s0 -; CHECK-NEXT: vrintr.f16 s0, s4 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vrintr.f16 s8, s8 -; CHECK-NEXT: vrintr.f16 s1, s5 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vrintr.f16 s8, s8 -; CHECK-NEXT: vrintr.f16 s2, s6 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vrintr.f16 s8, s8 -; CHECK-NEXT: vrintr.f16 s3, s7 -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vrintr.f16 s0, s0 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vrintr.f16 s1, s1 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vrintr.f16 s2, s2 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vrintr.f16 s4, s4 +; CHECK-NEXT: vrintr.f16 s3, s3 +; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: 
%0 = call fast <8 x half> @llvm.nearbyint.v8f16(<8 x half> %src) @@ -291,11 +283,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @ffloor_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: ffloor_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrintm.f32 s7, s3 -; CHECK-MVE-NEXT: vrintm.f32 s6, s2 -; CHECK-MVE-NEXT: vrintm.f32 s5, s1 -; CHECK-MVE-NEXT: vrintm.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrintm.f32 s3, s3 +; CHECK-MVE-NEXT: vrintm.f32 s2, s2 +; CHECK-MVE-NEXT: vrintm.f32 s1, s1 +; CHECK-MVE-NEXT: vrintm.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ffloor_float32_t: @@ -310,23 +301,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @ffloor_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: ffloor_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrintm.f16 s8, s0 -; CHECK-MVE-NEXT: vrintm.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrintm.f16 s8, s8 -; CHECK-MVE-NEXT: vrintm.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrintm.f16 s8, s8 -; CHECK-MVE-NEXT: vrintm.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrintm.f16 s8, s8 -; CHECK-MVE-NEXT: vrintm.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrintm.f16 s0, s0 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vrintm.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vrintm.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrintm.f16 s4, s4 +; CHECK-MVE-NEXT: vrintm.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; 
CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: ffloor_float16_t: @@ -365,11 +355,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @fround_float32_t(<4 x float> %src) { ; CHECK-MVE-LABEL: fround_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vrinta.f32 s7, s3 -; CHECK-MVE-NEXT: vrinta.f32 s6, s2 -; CHECK-MVE-NEXT: vrinta.f32 s5, s1 -; CHECK-MVE-NEXT: vrinta.f32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vrinta.f32 s3, s3 +; CHECK-MVE-NEXT: vrinta.f32 s2, s2 +; CHECK-MVE-NEXT: vrinta.f32 s1, s1 +; CHECK-MVE-NEXT: vrinta.f32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fround_float32_t: @@ -384,23 +373,22 @@ entry: define arm_aapcs_vfpcc <8 x half> @fround_float16_t(<8 x half> %src) { ; CHECK-MVE-LABEL: fround_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q1, q0 -; CHECK-MVE-NEXT: vmovx.f16 s0, s4 -; CHECK-MVE-NEXT: vrinta.f16 s8, s0 -; CHECK-MVE-NEXT: vrinta.f16 s0, s4 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s5 -; CHECK-MVE-NEXT: vrinta.f16 s8, s8 -; CHECK-MVE-NEXT: vrinta.f16 s1, s5 -; CHECK-MVE-NEXT: vins.f16 s1, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s6 -; CHECK-MVE-NEXT: vrinta.f16 s8, s8 -; CHECK-MVE-NEXT: vrinta.f16 s2, s6 -; CHECK-MVE-NEXT: vins.f16 s2, s8 -; CHECK-MVE-NEXT: vmovx.f16 s8, s7 -; CHECK-MVE-NEXT: vrinta.f16 s8, s8 -; CHECK-MVE-NEXT: vrinta.f16 s3, s7 -; CHECK-MVE-NEXT: vins.f16 s3, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s0 +; CHECK-MVE-NEXT: vrinta.f16 s0, s0 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vins.f16 s0, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vrinta.f16 s1, s1 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vrinta.f16 s2, s2 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vrinta.f16 s4, s4 +; CHECK-MVE-NEXT: vrinta.f16 s3, s3 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; 
CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: fround_float16_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll index 37e4122ac012c..ac1c0d03c85b5 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -89,23 +89,23 @@ define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %off ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vldr.16 s8, [r3] -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r2, r3, d3 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vldr.16 s4, [r3] -; CHECK-NEXT: vldr.16 s1, [r2] -; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vldr.16 s2, [r3] +; CHECK-NEXT: vldr.16 s1, [r2] ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vldr.16 s8, [r1] +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s4 ; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s3, [r0] ; CHECK-NEXT: vins.f16 s3, s4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll index 816969209ff8d..654e7eea28a1c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -172,10 +172,10 @@ define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) { ; CHECK-NEXT: vldr s3, [r2] ; CHECK-NEXT: vldr s2, [r12] ; CHECK-NEXT: vldr s1, [r1] +; CHECK-NEXT: vldr s0, [lr] ; CHECK-NEXT: vldr s7, [r3] ; CHECK-NEXT: vldr s6, [r0] ; 
CHECK-NEXT: vldr s5, [r5] -; CHECK-NEXT: vldr s0, [lr] ; CHECK-NEXT: vldr s4, [r4] ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -413,23 +413,23 @@ entry: define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) { ; CHECK-LABEL: ptr_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vldr.16 s0, [r1] -; CHECK-NEXT: vmov r1, r2, d3 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r1, r2, d0 ; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vldr.16 s1, [r1] -; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldr.16 s1, [r1] +; CHECK-NEXT: vldr.16 s2, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vldr.16 s8, [r1] +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vldr.16 s3, [r0] +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vldr.16 s4, [r1] ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -441,15 +441,15 @@ entry: define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) { ; CHECK-LABEL: ptr_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vldr.16 s8, [r1] -; CHECK-NEXT: vldr.16 s0, [r0] -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vldr.16 s4, [r1] +; CHECK-NEXT: vldr.16 s0, [r0] +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: vldr.16 s2, [r1] ; CHECK-NEXT: vldr.16 s1, [r0] -; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: bx lr entry: %offs = load <4 x half*>, <4 x half*>* %offptr, align 4 diff --git 
a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll index 9d4d261a82709..917cec927a993 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll @@ -81,8 +81,6 @@ declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8 define arm_aapcs_vfpcc void @test_vst2q_u32(i32* %addr, %struct.uint32x4x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0] ; CHECK-NEXT: vst21.32 {q0, q1}, [r0] ; CHECK-NEXT: bx lr @@ -97,8 +95,6 @@ entry: define arm_aapcs_vfpcc i32* @test_vst2q_u32_post(i32* %addr, %struct.uint32x4x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_u32_post: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.32 {q0, q1}, [r0] ; CHECK-NEXT: vst21.32 {q0, q1}, [r0]! 
; CHECK-NEXT: bx lr @@ -116,8 +112,6 @@ declare void @llvm.arm.mve.vst2q.p0i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32) define arm_aapcs_vfpcc void @test_vst2q_f16(half* %addr, %struct.float16x8x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.16 {q0, q1}, [r0] ; CHECK-NEXT: vst21.16 {q0, q1}, [r0] ; CHECK-NEXT: bx lr @@ -132,8 +126,6 @@ entry: define arm_aapcs_vfpcc half* @test_vst2q_f16_post(half* %addr, %struct.float16x8x2_t %value.coerce) { ; CHECK-LABEL: test_vst2q_f16_post: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: vst20.16 {q0, q1}, [r0] ; CHECK-NEXT: vst21.16 {q0, q1}, [r0]! ; CHECK-NEXT: bx lr @@ -151,10 +143,6 @@ declare void @llvm.arm.mve.vst2q.p0f16.v8f16(half*, <8 x half>, <8 x half>, i32) define arm_aapcs_vfpcc void @test_vst4q_s8(i8* %addr, %struct.int8x16x4_t %value.coerce) { ; CHECK-LABEL: test_vst4q_s8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r0] @@ -175,10 +163,6 @@ entry: define arm_aapcs_vfpcc i8* @test_vst4q_s8_post(i8* %addr, %struct.int8x16x4_t %value.coerce) { ; CHECK-LABEL: test_vst4q_s8_post: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q1 
killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 ; CHECK-NEXT: vst40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.8 {q0, q1, q2, q3}, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll index bf601d71761cc..120105cfd14c7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -6,55 +6,55 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} ; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q5, [r2] ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vand q3, q2, q0 ; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r4, r1, d4 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s18, s23 ; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r0, r12, d5 -; CHECK-NEXT: vmov.f32 s8, s20 -; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov r4, r1, d6 +; CHECK-NEXT: vmov r0, r12, d7 +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f32 s6, s13 ; CHECK-NEXT: adds r2, r5, r4 -; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: 
asr.w r6, r5, #31 ; CHECK-NEXT: adcs r1, r6 ; CHECK-NEXT: asrl r2, r1, r4 ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: adds r6, r1, r3 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: asr.w r4, r1, #31 ; CHECK-NEXT: adc.w r1, r4, lr ; CHECK-NEXT: asrl r6, r1, r3 ; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 ; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: asr.w r3, r1, #31 ; CHECK-NEXT: adc.w r1, r3, r12 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: asrl r0, r1, r3 -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r6, r1, r5 ; CHECK-NEXT: asr.w r2, r1, #31 ; CHECK-NEXT: adc.w r1, r2, r4 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: asrl r6, r1, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 @@ -142,30 +142,30 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q5, [r2] -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vand q3, q2, q0 ; CHECK-NEXT: vand q1, q1, q0 -; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r5, r1, d2 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmov r4, lr, d4 -; CHECK-NEXT: 
vmov.f32 s2, s1 -; CHECK-NEXT: vmov r0, r12, d3 -; CHECK-NEXT: vmov.f32 s4, s20 -; CHECK-NEXT: vmov.f32 s6, s21 +; CHECK-NEXT: vmov r4, lr, d2 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r5, r1, d6 +; CHECK-NEXT: vmov r0, r12, d7 +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov.f32 s4, s12 +; CHECK-NEXT: vmov.f32 s2, s13 ; CHECK-NEXT: adds r2, r6, r5 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r5, s8 ; CHECK-NEXT: asr.w r7, r6, #31 ; CHECK-NEXT: adcs r1, r7 ; CHECK-NEXT: asrl r2, r1, r5 @@ -175,23 +175,23 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i ; CHECK-NEXT: asr.w r5, r1, #31 ; CHECK-NEXT: adc.w r1, r5, lr ; CHECK-NEXT: asrl r4, r1, r7 -; CHECK-NEXT: vmov r6, r5, d5 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 +; CHECK-NEXT: vmov r6, r5, d3 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 ; CHECK-NEXT: adds r0, r0, r1 ; CHECK-NEXT: asr.w r7, r1, #31 ; CHECK-NEXT: adc.w r1, r7, r12 ; CHECK-NEXT: vmov r7, s18 ; CHECK-NEXT: asrl r0, r1, r7 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r6, r6, r1 ; CHECK-NEXT: asr.w r2, r1, #31 ; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r0 -; CHECK-NEXT: vstrw.32 q2, [r3] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov q1[3], q1[1], r6, r0 +; CHECK-NEXT: vstrw.32 q1, [r3] +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: @@ -280,9 +280,9 @@ define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) { ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s2, s3 +; 
CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: adds.w r12, r2, r2 ; CHECK-NEXT: asr.w r3, r2, #31 ; CHECK-NEXT: adc.w r7, r3, r2, asr #31 @@ -370,26 +370,24 @@ define arm_aapcs_vfpcc void @mul_i32(<4 x i32> *%A, <4 x i32> *%B, i64 %C, <4 x ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: ldr.w lr, [sp, #20] -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: smull r12, r3, r1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.s32 q2, q1, q0 ; CHECK-NEXT: asrl r12, r3, r2 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmullb.s32 q1, q0, q2 -; CHECK-NEXT: vmov r6, r1, d2 -; CHECK-NEXT: vmov r4, r7, d3 +; CHECK-NEXT: vmov r6, r1, d4 +; CHECK-NEXT: vmov r4, r7, d5 ; CHECK-NEXT: asrl r6, r1, r2 ; CHECK-NEXT: asrl r4, r7, r2 ; CHECK-NEXT: smull r0, r5, r5, r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll index 44fd3e621969c..655a67bad734e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -65,20 +65,20 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK-LABEL: ext_add_trunc_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: 
vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: add.w r12, r1, r0 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add r1, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: add r0, r3 @@ -184,17 +184,17 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r0, r1, d6 ; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov r12, lr, d7 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: adds r0, r0, r4 ; CHECK-NEXT: asr.w r5, r4, #31 ; CHECK-NEXT: adcs r1, r5 @@ -205,9 +205,9 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: lsrl r2, r3, #1 ; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: adds.w r4, r3, r12 ; CHECK-NEXT: 
asr.w r6, r3, #31 ; CHECK-NEXT: adc.w r3, r6, lr @@ -216,8 +216,7 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> ; CHECK-NEXT: adc.w r1, r2, r5 ; CHECK-NEXT: lsrl r4, r3, #1 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r4 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> @@ -346,11 +345,11 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vand q1, q1, q3 ; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmov r12, r2, d5 ; CHECK-NEXT: vmov r8, r9, d3 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov lr, s2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: adds.w r4, r1, r12 @@ -359,21 +358,21 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) ; CHECK-NEXT: asrl r4, r5, r12 ; CHECK-NEXT: subs.w r0, r4, r12 ; CHECK-NEXT: sbc.w r2, r5, r2 -; CHECK-NEXT: asr.w r5, lr, #31 ; CHECK-NEXT: umull r0, r4, r0, r12 ; CHECK-NEXT: adds.w r6, lr, r8 +; CHECK-NEXT: mla r3, r2, r12, r4 +; CHECK-NEXT: asr.w r5, lr, #31 ; CHECK-NEXT: adc.w r5, r5, r9 +; CHECK-NEXT: rsbs r2, r1, #0 ; CHECK-NEXT: asrl r6, r5, r8 -; CHECK-NEXT: mla r3, r2, r12, r4 +; CHECK-NEXT: lsll r0, r3, r2 ; CHECK-NEXT: subs.w r7, r6, r8 +; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: sbc.w r10, r5, r9 -; CHECK-NEXT: rsbs r2, r1, #0 ; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: lsll r0, r3, r2 -; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: lsll r0, r3, r12 -; CHECK-NEXT: asrs r3, r5, #31 ; CHECK-NEXT: adds r4, r5, r6 +; CHECK-NEXT: asr.w r3, r5, #31 ; CHECK-NEXT: adcs r3, r2 ; CHECK-NEXT: asrl r4, r3, r6 ; CHECK-NEXT: subs r4, r4, r6 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll 
b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll index 2abf5ef19addd..f65ad3e7de22b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -851,15 +851,15 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bpl .LBB18_5 ; CHECK-LE-NEXT: .LBB18_4: @ %cond.load7 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: .LBB18_5: @ %else8 ; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-LE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-LE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-LE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-LE-NEXT: and r3, r2, #1 ; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 @@ -874,19 +874,19 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r2, s4 +; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s5 +; CHECK-LE-NEXT: vmovmi r2, s1 ; CHECK-LE-NEXT: strmi r2, [r0, #4] ; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s6 +; CHECK-LE-NEXT: vmovmi r2, s2 ; CHECK-LE-NEXT: strmi r2, [r0, #8] ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, s7 +; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} @@ -895,14 +895,14 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-LE-NEXT: lsls 
r3, r1, #30 ; CHECK-LE-NEXT: bpl .LBB18_2 ; CHECK-LE-NEXT: .LBB18_7: @ %cond.load1 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-LE-NEXT: vins.f16 s0, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-LE-NEXT: vins.f16 s0, s2 ; CHECK-LE-NEXT: lsls r3, r1, #29 ; CHECK-LE-NEXT: bpl .LBB18_3 ; CHECK-LE-NEXT: .LBB18_8: @ %cond.load4 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 ; CHECK-LE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vmovx.f16 s2, s0 +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bmi .LBB18_4 ; CHECK-LE-NEXT: b .LBB18_5 @@ -942,15 +942,15 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB18_5 ; CHECK-BE-NEXT: .LBB18_4: @ %cond.load7 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: .LBB18_5: @ %else8 ; CHECK-BE-NEXT: vmrs r2, p0 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-BE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-BE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-BE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: bfi r1, r3, #0, #1 @@ -965,19 +965,19 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s4 +; CHECK-BE-NEXT: vmovmi r2, s0 ; CHECK-BE-NEXT: strmi r2, [r0] ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: vmovmi r2, s1 ; CHECK-BE-NEXT: strmi r2, [r0, #4] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: 
vmovmi r2, s6 +; CHECK-BE-NEXT: vmovmi r2, s2 ; CHECK-BE-NEXT: strmi r2, [r0, #8] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne r1, s7 +; CHECK-BE-NEXT: vmovne r1, s3 ; CHECK-BE-NEXT: strne r1, [r0, #12] ; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} @@ -986,14 +986,14 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% ; CHECK-BE-NEXT: lsls r3, r1, #29 ; CHECK-BE-NEXT: bpl .LBB18_2 ; CHECK-BE-NEXT: .LBB18_7: @ %cond.load1 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-BE-NEXT: vins.f16 s0, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-BE-NEXT: vins.f16 s0, s2 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB18_3 ; CHECK-BE-NEXT: .LBB18_8: @ %cond.load4 -; CHECK-BE-NEXT: vmovx.f16 s4, s1 ; CHECK-BE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vmovx.f16 s2, s0 +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: bne .LBB18_4 ; CHECK-BE-NEXT: b .LBB18_5 @@ -1042,15 +1042,15 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bpl .LBB19_5 ; CHECK-LE-NEXT: .LBB19_4: @ %cond.load7 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: .LBB19_5: @ %else8 ; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-LE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-LE-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-LE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-LE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-LE-NEXT: and r3, r2, #1 ; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 @@ -1065,19 +1065,19 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; 
CHECK-LE-NEXT: bfi r1, r2, #3, #1 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r2, s4 +; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s5 +; CHECK-LE-NEXT: vmovmi r2, s1 ; CHECK-LE-NEXT: strmi r2, [r0, #4] ; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r2, s6 +; CHECK-LE-NEXT: vmovmi r2, s2 ; CHECK-LE-NEXT: strmi r2, [r0, #8] ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, s7 +; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r7, pc} @@ -1086,14 +1086,14 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-LE-NEXT: lsls r3, r1, #30 ; CHECK-LE-NEXT: bpl .LBB19_2 ; CHECK-LE-NEXT: .LBB19_7: @ %cond.load1 -; CHECK-LE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-LE-NEXT: vins.f16 s0, s4 +; CHECK-LE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-LE-NEXT: vins.f16 s0, s2 ; CHECK-LE-NEXT: lsls r3, r1, #29 ; CHECK-LE-NEXT: bpl .LBB19_3 ; CHECK-LE-NEXT: .LBB19_8: @ %cond.load4 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 ; CHECK-LE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-LE-NEXT: vins.f16 s1, s4 +; CHECK-LE-NEXT: vmovx.f16 s2, s0 +; CHECK-LE-NEXT: vins.f16 s1, s2 ; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: bmi .LBB19_4 ; CHECK-LE-NEXT: b .LBB19_5 @@ -1133,15 +1133,15 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: beq .LBB19_5 ; CHECK-BE-NEXT: .LBB19_4: @ %cond.load7 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #6] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #6] +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: .LBB19_5: @ %else8 ; CHECK-BE-NEXT: vmrs r2, p0 ; CHECK-BE-NEXT: movs r1, #0 -; CHECK-BE-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-BE-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-BE-NEXT: 
vcvtt.f32.f16 s5, s0 -; CHECK-BE-NEXT: vcvtb.f32.f16 s4, s0 +; CHECK-BE-NEXT: vcvtt.f32.f16 s3, s1 +; CHECK-BE-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-BE-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-BE-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 ; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: bfi r1, r3, #0, #1 @@ -1156,19 +1156,19 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s4 +; CHECK-BE-NEXT: vmovmi r2, s0 ; CHECK-BE-NEXT: strmi r2, [r0] ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s5 +; CHECK-BE-NEXT: vmovmi r2, s1 ; CHECK-BE-NEXT: strmi r2, [r0, #4] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r2, s6 +; CHECK-BE-NEXT: vmovmi r2, s2 ; CHECK-BE-NEXT: strmi r2, [r0, #8] ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne r1, s7 +; CHECK-BE-NEXT: vmovne r1, s3 ; CHECK-BE-NEXT: strne r1, [r0, #12] ; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: pop {r7, pc} @@ -1177,14 +1177,14 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 ; CHECK-BE-NEXT: lsls r3, r1, #29 ; CHECK-BE-NEXT: bpl .LBB19_2 ; CHECK-BE-NEXT: .LBB19_7: @ %cond.load1 -; CHECK-BE-NEXT: vldr.16 s4, [r2, #2] -; CHECK-BE-NEXT: vins.f16 s0, s4 +; CHECK-BE-NEXT: vldr.16 s2, [r2, #2] +; CHECK-BE-NEXT: vins.f16 s0, s2 ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB19_3 ; CHECK-BE-NEXT: .LBB19_8: @ %cond.load4 -; CHECK-BE-NEXT: vmovx.f16 s4, s1 ; CHECK-BE-NEXT: vldr.16 s1, [r2, #4] -; CHECK-BE-NEXT: vins.f16 s1, s4 +; CHECK-BE-NEXT: vmovx.f16 s2, s0 +; CHECK-BE-NEXT: vins.f16 s1, s2 ; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: bne .LBB19_4 ; CHECK-BE-NEXT: b .LBB19_5 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll index 
c6c989b2ff85e..8c30520d02cd4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -108,8 +108,8 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -117,8 +117,8 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-LABEL: masked_v4i32_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! @@ -137,8 +137,8 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -146,8 +146,8 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-LABEL: masked_v4i32_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 @@ -327,8 +327,8 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; 
CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -336,8 +336,8 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-LABEL: masked_v8i16_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! @@ -356,8 +356,8 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -365,8 +365,8 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-LABEL: masked_v8i16_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 @@ -405,8 +405,8 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-LABEL: masked_v16i8_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr ; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4]! 
; CHECK-LE-NEXT: bx lr @@ -414,8 +414,8 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-LABEL: masked_v16i8_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr ; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4]! @@ -434,8 +434,8 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-LABEL: masked_v16i8_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr ; CHECK-LE-NEXT: vstrbt.8 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -443,8 +443,8 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-LABEL: masked_v16i8_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrb.u8 q1, [r1] ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr ; CHECK-BE-NEXT: vstrbt.8 q1, [r0], #4 @@ -568,8 +568,8 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -577,8 +577,8 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-LABEL: masked_v4f32_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! 
@@ -597,8 +597,8 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr ; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -606,8 +606,8 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-LABEL: masked_v4f32_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrw.u32 q1, [r1] ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr ; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 @@ -709,8 +709,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> % ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: bpl .LBB16_2 ; CHECK-LE-NEXT: .LBB16_10: @ %cond.store1 -; CHECK-LE-NEXT: vmovx.f16 s4, s0 -; CHECK-LE-NEXT: vstr.16 s4, [sp, #24] +; CHECK-LE-NEXT: vmovx.f16 s0, s0 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #24] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #24] ; CHECK-LE-NEXT: strh r2, [r0, #2] ; CHECK-LE-NEXT: lsls r2, r1, #29 @@ -722,8 +722,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> % ; CHECK-LE-NEXT: lsls r2, r1, #28 ; CHECK-LE-NEXT: bpl .LBB16_4 ; CHECK-LE-NEXT: .LBB16_12: @ %cond.store5 -; CHECK-LE-NEXT: vmovx.f16 s4, s1 -; CHECK-LE-NEXT: vstr.16 s4, [sp, #16] +; CHECK-LE-NEXT: vmovx.f16 s0, s1 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #16] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #16] ; CHECK-LE-NEXT: strh r2, [r0, #6] ; CHECK-LE-NEXT: lsls r2, r1, #27 @@ -735,8 +735,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> % ; CHECK-LE-NEXT: lsls r2, r1, #26 ; CHECK-LE-NEXT: bpl .LBB16_6 ; CHECK-LE-NEXT: .LBB16_14: @ %cond.store9 -; CHECK-LE-NEXT: vmovx.f16 s4, s2 -; CHECK-LE-NEXT: vstr.16 s4, [sp, #8] +; 
CHECK-LE-NEXT: vmovx.f16 s0, s2 +; CHECK-LE-NEXT: vstr.16 s0, [sp, #8] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #8] ; CHECK-LE-NEXT: strh r2, [r0, #10] ; CHECK-LE-NEXT: lsls r2, r1, #25 @@ -877,8 +877,8 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_pre: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr @@ -886,8 +886,8 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-LABEL: masked_v8f16_pre: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! @@ -906,8 +906,8 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_post: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vldr d1, [sp] -; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 +; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr ; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr @@ -915,8 +915,8 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-LABEL: masked_v8f16_post: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vldr d1, [sp] -; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vmov d0, r3, r2 +; CHECK-BE-NEXT: vldrh.u16 q1, [r1] ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr ; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 @@ -1253,12 +1253,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float> ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: bfi r1, r3, #2, 
#1 ; CHECK-LE-NEXT: csetm r2, ne -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB25_5 @@ -1328,12 +1328,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float> ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r2, #1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r3, #2, #1 ; CHECK-BE-NEXT: csetm r2, ne -; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB25_5 @@ -1354,8 +1354,8 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(<4 x half> *%dest, <4 x float> ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB25_2 ; CHECK-BE-NEXT: .LBB25_6: @ %cond.store1 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: vmovx.f16 s0, s0 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB25_3 ; CHECK-BE-NEXT: .LBB25_7: @ %cond.store3 @@ -1409,12 +1409,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float> ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: csetm r2, ne -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 ; CHECK-LE-NEXT: 
vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB26_5 @@ -1484,12 +1484,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float> ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r2, #1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r3, #2, #1 ; CHECK-BE-NEXT: csetm r2, ne -; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB26_5 @@ -1510,8 +1510,8 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(<4 x half> *%dest, <4 x float> ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB26_2 ; CHECK-BE-NEXT: .LBB26_6: @ %cond.store1 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vstr.16 s4, [r0, #2] +; CHECK-BE-NEXT: vmovx.f16 s0, s0 +; CHECK-BE-NEXT: vstr.16 s0, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB26_3 ; CHECK-BE-NEXT: .LBB26_7: @ %cond.store3 @@ -1565,12 +1565,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float> ; CHECK-LE-NEXT: it gt ; CHECK-LE-NEXT: movgt r2, #1 ; CHECK-LE-NEXT: cmp r2, #0 -; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: csetm r2, ne -; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 +; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 +; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 +; CHECK-LE-NEXT: vcvtb.f16.f32 s6, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: bne .LBB27_5 @@ -1648,12 +1648,12 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float> ; CHECK-BE-NEXT: it gt ; CHECK-BE-NEXT: movgt r2, #1 ; CHECK-BE-NEXT: cmp r2, #0 -; CHECK-BE-NEXT: 
vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r3, #2, #1 ; CHECK-BE-NEXT: csetm r2, ne -; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-BE-NEXT: bfi r1, r2, #3, #1 -; CHECK-BE-NEXT: vcvtb.f16.f32 s1, s6 +; CHECK-BE-NEXT: vcvtt.f16.f32 s0, s5 +; CHECK-BE-NEXT: vcvtb.f16.f32 s2, s6 ; CHECK-BE-NEXT: vcvtt.f16.f32 s1, s7 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bmi .LBB27_5 @@ -1676,8 +1676,8 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(<4 x half> *%dest, <4 x float> ; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: bpl .LBB27_2 ; CHECK-BE-NEXT: .LBB27_6: @ %cond.store1 -; CHECK-BE-NEXT: vmovx.f16 s4, s0 -; CHECK-BE-NEXT: vstr.16 s4, [sp, #8] +; CHECK-BE-NEXT: vmovx.f16 s0, s0 +; CHECK-BE-NEXT: vstr.16 s0, [sp, #8] ; CHECK-BE-NEXT: ldrh.w r2, [sp, #8] ; CHECK-BE-NEXT: strh r2, [r0, #2] ; CHECK-BE-NEXT: lsls r2, r1, #30 diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll index 6b2939c3a0c1b..912773e2d5131 100644 --- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -279,11 +279,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @maxnm_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: maxnm_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmaxnm.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vmaxnm.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vmaxnm.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vmaxnm.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vmaxnm.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vmaxnm.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vmaxnm.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: maxnm_float32_t: @@ -299,27 +298,26 @@ entry: define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: minnm_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: 
vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vminnm.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vminnm.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vminnm.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vminnm.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vminnm.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vminnm.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vminnm.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vminnm.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vminnm.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vminnm.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vminnm.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vminnm.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: minnm_float16_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll index 24eef30656e95..ded3d3141e361 100644 --- a/llvm/test/CodeGen/Thumb2/mve-nofloat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-nofloat.ll @@ -104,20 +104,20 @@ define arm_aapcs_vfpcc <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: .save {r4, r5, r7, lr} ; CHECK-NOFP-NEXT: push {r4, r5, r7, lr} -; 
CHECK-NOFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NOFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NOFP-NEXT: vmov q5, q1 -; CHECK-NOFP-NEXT: vmov q6, q0 -; CHECK-NOFP-NEXT: vmov r4, r0, d13 -; CHECK-NOFP-NEXT: vmov r5, r1, d11 +; CHECK-NOFP-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NOFP-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NOFP-NEXT: vmov q4, q1 +; CHECK-NOFP-NEXT: vmov q5, q0 +; CHECK-NOFP-NEXT: vmov r4, r0, d11 +; CHECK-NOFP-NEXT: vmov r5, r1, d9 ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s19, r0 ; CHECK-NOFP-NEXT: mov r0, r4 ; CHECK-NOFP-NEXT: mov r1, r5 ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s18, r0 -; CHECK-NOFP-NEXT: vmov r4, r0, d12 -; CHECK-NOFP-NEXT: vmov r5, r1, d10 +; CHECK-NOFP-NEXT: vmov r4, r0, d10 +; CHECK-NOFP-NEXT: vmov r5, r1, d8 ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s17, r0 ; CHECK-NOFP-NEXT: mov r0, r4 @@ -125,7 +125,7 @@ define arm_aapcs_vfpcc <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> ; CHECK-NOFP-NEXT: bl __aeabi_fadd ; CHECK-NOFP-NEXT: vmov s16, r0 ; CHECK-NOFP-NEXT: vmov q0, q4 -; CHECK-NOFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NOFP-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NOFP-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-FP-LABEL: vector_add_f32: diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index 8c8667f1762a5..8a1109950c03a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -152,40 +152,39 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: .pad #408 ; CHECK-NEXT: sub sp, #408 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals -; CHECK-NEXT: vldr s12, .LCPI1_0 -; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals ; CHECK-NEXT: vldr s15, .LCPI1_1 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: ldr r0, [r3, #4]! 
+; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals ; CHECK-NEXT: movw r2, :lower16:e +; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: ldr r6, [r4, #8]! -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov s13, r3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: movt r2, :upper16:e +; CHECK-NEXT: ldr r0, [r3, #4]! ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: movt r2, :upper16:e +; CHECK-NEXT: vmov r5, s15 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov s21, r2 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov s13, r3 +; CHECK-NEXT: vldr s12, .LCPI1_0 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 -; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 -; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: vstrw.32 q0, [sp, #92] ; CHECK-NEXT: vmov q0, q7 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vmov q4, q7 ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q7[1], r2 -; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vmov s21, r2 ; CHECK-NEXT: movs r1, #64 +; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: str r0, [sp, #40] -; CHECK-NEXT: vstrw.32 q5, [r0] +; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: str r6, [r0] -; CHECK-NEXT: vstrw.32 q7, [r0] +; CHECK-NEXT: vmov.f32 s23, s15 ; CHECK-NEXT: str r0, [r0] +; CHECK-NEXT: vstrw.32 q5, [r0] +; CHECK-NEXT: vstrw.32 q7, [r0] ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q6, [r0] ; CHECK-NEXT: mov.w r8, #0 @@ -193,6 +192,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov q2[2], q2[0], r3, r3 ; CHECK-NEXT: mov.w r12, #4 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 ; CHECK-NEXT: vmov.32 q4[0], r8 ; CHECK-NEXT: @ implicit-def: $r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index 17a47e5ec54c7..9f44be17172fb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -34,13 +34,13 @@ define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: vadd.f32 s4, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: add.w r7, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: adds r0, #1 ; CHECK-NEXT: add r3, r9 ; CHECK-NEXT: cmp r0, r12 -; CHECK-NEXT: vadd.f32 s0, s0, s4 +; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vstr s0, [r7] ; CHECK-NEXT: bne .LBB0_2 ; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup @@ -138,15 +138,15 @@ define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: letp lr, .LBB1_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: vadd.f32 s8, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: add.w r0, r2, r9, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: add r11, r10 -; CHECK-NEXT: vadd.f32 s2, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: add r6, r10 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s0, s0, s8 -; CHECK-NEXT: vadd.f32 s2, s4, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s2, s4, s6 ; CHECK-NEXT: vstr s0, [r0] ; CHECK-NEXT: add.w r0, r2, r4, lsl #2 ; CHECK-NEXT: adds r4, #2 @@ -279,21 +279,21 @@ define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: letp lr, .LBB2_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 -; CHECK-NEXT: vadd.f32 s12, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: adds r0, r5, #1 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add r10, r11 -; CHECK-NEXT: vadd.f32 s10, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: add r12, r11 -; CHECK-NEXT: vadd.f32 s6, s2, s3 +; 
CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: add r8, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s8, s12 -; CHECK-NEXT: vadd.f32 s4, s4, s10 -; CHECK-NEXT: vadd.f32 s0, s0, s6 -; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vstr s8, [r0] ; CHECK-NEXT: add.w r0, r2, r5, lsl #2 ; CHECK-NEXT: vstr s4, [r0] ; CHECK-NEXT: adds r0, r5, #2 @@ -450,22 +450,22 @@ define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: letp lr, .LBB3_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 -; CHECK-NEXT: vadd.f32 s16, s14, s15 +; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: vadd.f32 s14, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s10, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s6, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s12, s16 -; CHECK-NEXT: vadd.f32 s8, s8, s14 -; CHECK-NEXT: vadd.f32 s4, s4, s10 -; CHECK-NEXT: vadd.f32 s0, s0, s6 -; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: vadd.f32 s12, s12, s14 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vstr s12, [r0] ; CHECK-NEXT: add.w r0, r1, r6, lsl #2 ; CHECK-NEXT: vstr s8, [r0] ; CHECK-NEXT: adds r0, r6, #2 @@ -645,26 +645,26 @@ define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: letp lr, .LBB4_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 -; CHECK-NEXT: vadd.f32 s20, s18, s19 +; CHECK-NEXT: vadd.f32 s18, s18, s19 ; CHECK-NEXT: add.w r3, r2, r11, 
lsl #2 ; CHECK-NEXT: vadd.f32 s16, s16, s17 -; CHECK-NEXT: vadd.f32 s18, s14, s15 +; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s6, s7 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s6, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s10, s2, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s2, s16, s20 -; CHECK-NEXT: vadd.f32 s12, s12, s18 -; CHECK-NEXT: vadd.f32 s4, s4, s14 -; CHECK-NEXT: vadd.f32 s6, s8, s6 -; CHECK-NEXT: vadd.f32 s0, s0, s10 -; CHECK-NEXT: vstr s2, [r3] +; CHECK-NEXT: vadd.f32 s1, s16, s18 +; CHECK-NEXT: vadd.f32 s12, s12, s14 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s6, s8, s10 +; CHECK-NEXT: vstr s1, [r3] ; CHECK-NEXT: add.w r3, r2, r0, lsl #2 ; CHECK-NEXT: vstr s12, [r3] ; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: add.w r3, r2, r3, lsl #2 ; CHECK-NEXT: vstr s6, [r3] ; CHECK-NEXT: adds r3, r0, #3 @@ -858,32 +858,32 @@ define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: letp lr, .LBB5_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: vadd.f32 s24, s22, s23 +; CHECK-NEXT: vadd.f32 s22, s22, s23 ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vadd.f32 s20, s20, s21 -; CHECK-NEXT: vadd.f32 s22, s18, s19 +; CHECK-NEXT: vadd.f32 s18, s18, s19 ; CHECK-NEXT: vadd.f32 s16, s16, s17 -; CHECK-NEXT: vadd.f32 s18, s6, s7 -; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s6, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s10, s11 +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s10, s2, s3 -; CHECK-NEXT: vadd.f32 s2, s20, s24 -; CHECK-NEXT: vadd.f32 s1, s16, s22 -; CHECK-NEXT: vadd.f32 
s6, s12, s6 -; CHECK-NEXT: vadd.f32 s4, s4, s18 -; CHECK-NEXT: vadd.f32 s8, s8, s14 -; CHECK-NEXT: vadd.f32 s0, s0, s10 -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r0, lsl #2 +; CHECK-NEXT: vadd.f32 s2, s2, s3 +; CHECK-NEXT: vadd.f32 s1, s20, s22 +; CHECK-NEXT: vadd.f32 s6, s6, s7 +; CHECK-NEXT: vadd.f32 s3, s16, s18 +; CHECK-NEXT: vadd.f32 s4, s4, s5 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vstr s1, [r1] +; CHECK-NEXT: add.w r1, r2, r0, lsl #2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vstr s3, [r1] ; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vadd.f32 s6, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: adds r1, r0, #4 @@ -1089,19 +1089,17 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q3 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload @@ -1122,32 +1120,32 @@ define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s0, s26, s27 ; CHECK-NEXT: add.w r1, r2, r12, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s24, s25 -; CHECK-NEXT: vadd.f32 s3, s20, s21 ; CHECK-NEXT: vadd.f32 s1, s22, s23 -; CHECK-NEXT: 
vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s20, s10, s11 -; CHECK-NEXT: vadd.f32 s11, s14, s15 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s14, s6, s7 +; CHECK-NEXT: vadd.f32 s3, s20, s21 +; CHECK-NEXT: vadd.f32 s6, s6, s7 ; CHECK-NEXT: vadd.f32 s4, s4, s5 -; CHECK-NEXT: vadd.f32 s10, s18, s19 -; CHECK-NEXT: vadd.f32 s9, s16, s17 +; CHECK-NEXT: vadd.f32 s10, s10, s11 +; CHECK-NEXT: vadd.f32 s8, s8, s9 +; CHECK-NEXT: vadd.f32 s9, s18, s19 +; CHECK-NEXT: vadd.f32 s11, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vadd.f32 s6, s18, s19 -; CHECK-NEXT: vadd.f32 s5, s16, s17 +; CHECK-NEXT: vadd.f32 s5, s18, s19 +; CHECK-NEXT: vadd.f32 s7, s16, s17 ; CHECK-NEXT: vadd.f32 s2, s3, s1 -; CHECK-NEXT: vadd.f32 s4, s4, s14 -; CHECK-NEXT: vadd.f32 s12, s12, s11 -; CHECK-NEXT: vadd.f32 s10, s9, s10 +; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s8, s8, s20 -; CHECK-NEXT: vadd.f32 s6, s5, s6 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s6, s7, s5 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 +; CHECK-NEXT: vadd.f32 s10, s11, s9 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] ; CHECK-NEXT: adds r1, r0, #4 @@ -1408,33 +1406,33 @@ define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* ; CHECK-NEXT: vadd.f32 s0, s30, s31 ; CHECK-NEXT: add.w r1, r2, r8, lsl #2 ; CHECK-NEXT: vadd.f32 s2, s28, s29 -; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vadd.f32 s5, s14, s15 ; CHECK-NEXT: vadd.f32 s4, s26, s27 ; CHECK-NEXT: vadd.f32 s6, s24, s25 -; CHECK-NEXT: vadd.f32 s14, s18, s19 +; CHECK-NEXT: vadd.f32 s5, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, 
[sp, #32] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s13, s10, s11 -; CHECK-NEXT: vadd.f32 s10, s18, s19 -; CHECK-NEXT: vadd.f32 s9, s16, s17 +; CHECK-NEXT: vadd.f32 s9, s18, s19 +; CHECK-NEXT: vadd.f32 s11, s16, s17 ; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s0, s2, s0 -; CHECK-NEXT: vadd.f32 s11, s18, s19 +; CHECK-NEXT: vadd.f32 s14, s14, s15 +; CHECK-NEXT: vadd.f32 s12, s12, s13 +; CHECK-NEXT: vadd.f32 s13, s18, s19 ; CHECK-NEXT: vadd.f32 s15, s16, s17 +; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s2, s6, s4 -; CHECK-NEXT: vadd.f32 s6, s12, s5 -; CHECK-NEXT: vadd.f32 s12, s7, s14 -; CHECK-NEXT: vadd.f32 s10, s9, s10 +; CHECK-NEXT: vadd.f32 s8, s8, s10 +; CHECK-NEXT: vadd.f32 s10, s11, s9 +; CHECK-NEXT: vadd.f32 s6, s12, s14 +; CHECK-NEXT: vadd.f32 s1, s22, s23 +; CHECK-NEXT: vadd.f32 s14, s15, s13 ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s8, s8, s13 -; CHECK-NEXT: vadd.f32 s14, s15, s11 +; CHECK-NEXT: vadd.f32 s3, s20, s21 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r12, lsl #2 -; CHECK-NEXT: vadd.f32 s1, s22, s23 -; CHECK-NEXT: vadd.f32 s3, s20, s21 +; CHECK-NEXT: vadd.f32 s12, s7, s5 ; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll index 0f1b483a0eb1e..b4f7a8ca4d47d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-build-var.ll @@ -159,11 +159,11 @@ define arm_aapcs_vfpcc <2 x i64> @build_var0_v2i1(i32 %s, i32 %t, <2 x i64> %a, ; CHECK-LABEL: build_var0_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: vldr s10, .LCPI9_0 ; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov s8, r0 -; CHECK-NEXT: vldr s10, .LCPI9_0 -; CHECK-NEXT: vmov.f32 s9, s8 ; CHECK-NEXT: 
vmov.f32 s11, s10 +; CHECK-NEXT: vmov.f32 s9, s8 ; CHECK-NEXT: vbic q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vorr q0, q0, q1 @@ -183,9 +183,9 @@ define arm_aapcs_vfpcc <2 x i64> @build_var1_v2i1(i32 %s, i32 %t, <2 x i64> %a, ; CHECK-LABEL: build_var1_v2i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: vldr s8, .LCPI10_0 ; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: vmov s10, r0 -; CHECK-NEXT: vldr s8, .LCPI10_0 ; CHECK-NEXT: vmov.f32 s9, s8 ; CHECK-NEXT: vmov.f32 s11, s10 ; CHECK-NEXT: vbic q1, q1, q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll index e9ae87165b455..477db0718410e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll @@ -79,9 +79,9 @@ entry: define <4 x i32> @shuffle2_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: shuffle2_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vcmp.i32 eq, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -100,9 +100,9 @@ entry: define <8 x i16> @shuffle2_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: shuffle2_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vcmp.i16 eq, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -121,9 +121,9 @@ entry: define <16 x i8> @shuffle2_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: shuffle2_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vmov d0, r0, r1 ; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: vcmp.i8 eq, q0, zr ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: mov r0, sp @@ -223,9 +223,9 @@ entry: define <4 x i32> @shuffle4_v4i32(<4 x i32> %src, <4 x 
i32> %a, <4 x i32> %b) { ; CHECK-LABEL: shuffle4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vmov.i8 q1, #0xff +; CHECK-NEXT: vmov d1, r2, r3 ; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vcmp.i32 eq, q0, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index ca9a725c79abd..3f7b0e6a437b1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -253,16 +253,15 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: mov.w r2, #-1 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: vmullb.s32 q6, q5, q4 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: vmov r4, r7, d13 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: sbcs.w r5, r2, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt @@ -306,10 +305,11 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: vmov q5[2], q5[0], r3, r4 ; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.f32 s10, s13 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 +; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: smull r6, r5, r6, r5 ; CHECK-NEXT: asrl r6, r5, #31 ; CHECK-NEXT: smull r4, r7, r4, r3 @@ -522,17 +522,15 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vorr q4, q4, q0 ; CHECK-NEXT: vpt.u32 cs, q1, q4 ; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; 
CHECK-NEXT: vmov.f32 s24, s18 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 -; CHECK-NEXT: vmov.f32 s28, s22 +; CHECK-NEXT: vmov.f32 s24, s18 ; CHECK-NEXT: vmov.f32 s26, s19 +; CHECK-NEXT: vmov.f32 s28, s22 ; CHECK-NEXT: vmov.f32 s30, s23 ; CHECK-NEXT: vmullb.s32 q0, q7, q6 -; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: asrl r6, r5, #31 -; CHECK-NEXT: vmov.f32 s22, s21 ; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 ; CHECK-NEXT: sbcs.w r7, r12, r5 ; CHECK-NEXT: mov.w r7, #0 @@ -575,11 +573,13 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: vmov q0[2], q0[0], r3, r4 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s22 ; CHECK-NEXT: vbic q7, q3, q0 ; CHECK-NEXT: vand q0, q6, q0 ; CHECK-NEXT: vorr q6, q0, q7 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov r4, s2 ; CHECK-NEXT: smull r6, r5, r4, r3 ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: asrl r6, r5, #31 diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll index 0e80c6241c041..a4b3632e4dd7e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -93,23 +93,23 @@ define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, < ; CHECK-LABEL: scaled_v8f16_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vshl.i32 q2, q1, #1 ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] ; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r1, r2, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 
s0, s1 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll index f549cebe304e9..fba6524589e59 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -111,20 +111,20 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q2, [r1] ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r1, r2, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] @@ -184,20 +184,20 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov r1, r2, d4 ; CHECK-NEXT: vstr.16 s0, [r1] -; 
CHECK-NEXT: vstr.16 s12, [r2] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r1, r2, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vstr.16 s8, [r2] +; CHECK-NEXT: vstr.16 s0, [r2] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll index affb361febd68..4c2ef5e01e28c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll @@ -291,8 +291,8 @@ entry: define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrw.32 q0, [r0, q2] @@ -310,8 +310,8 @@ entry: define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vldrb.u32 q2, [r1] +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrw.32 q0, [r0, q2] diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll index e8daac426b4cf..edd8a07166e4a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -312,18 +312,18 @@ define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) { 
; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmovx.f16 s12, s0 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: vstr.16 s12, [r1] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vstr.16 s3, [r0] @@ -339,10 +339,10 @@ define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) { ; CHECK-LABEL: ptr_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: vstr.16 s0, [r0] -; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vstr.16 s0, [r1] ; CHECK-NEXT: vmov r0, r1, d3 ; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vstr.16 s1, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll index 54249151d448e..bdf5fb2354ed2 100644 --- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll @@ -52,30 +52,29 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q5, [r0] -; CHECK-NEXT: vmov.f64 d8, d10 -; CHECK-NEXT: vmov.f32 s18, s21 -; 
CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov.f32 s2, s23 ; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov.f32 s20, s22 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d11, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 @@ -84,7 +83,7 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS ; CHECK-NEXT: vmov d10, r0, r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index b66e7b24536cf..6ede494c81ea7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -118,8 +118,8 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle2step_i32(<8 x i32> %src) { ; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: bx lr @@ -135,17 +135,17 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle3step_i32(<16 x i32> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s16, s0 
; CHECK-NEXT: vmov.f32 s13, s4 -; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov.f32 s19, s9 ; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vadd.i32 q0, q3, q1 ; CHECK-NEXT: vpop {d8, d9} @@ -167,18 +167,18 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle4step_i32(<16 x i32> %src) { ; CHECK-NEXT: vmov.f32 s16, s3 ; CHECK-NEXT: vmov.f32 s20, s2 ; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vmov.f32 s21, s6 ; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vmov.f32 s19, s15 +; CHECK-NEXT: vmov.f32 s21, s6 +; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vmov.f32 s23, s14 ; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s20, s1 ; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s22, s9 -; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vadd.i32 q0, q0, q5 ; CHECK-NEXT: vadd.i32 q0, q0, q4 @@ -202,12 +202,12 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -228,14 +228,14 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { ; CHECK-LABEL: shuffle3_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov 
q1, q0 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -323,30 +323,27 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) { ; CHECK-LABEL: shuffle2step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmovx.f16 s9, s2 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vmovx.f16 s10, s4 -; CHECK-NEXT: vmovx.f16 s16, s1 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s0 ; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vmovx.f16 s10, s4 ; CHECK-NEXT: vins.f16 s9, s0 ; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s4, s5 ; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vmov.f32 s13, s2 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s11, s6 +; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmov.f32 s13, s2 ; CHECK-NEXT: vins.f16 s11, s0 +; CHECK-NEXT: vmov.f32 s14, s4 ; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vadd.i16 q0, q3, q2 -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> @@ -358,51 +355,54 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle3step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmovx.f16 s16, 
s1 +; CHECK-NEXT: .vsave {d11, d12, d13} +; CHECK-NEXT: vpush {d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s14, s1 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s14, s4 ; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vins.f16 s13, s16 -; CHECK-NEXT: vmovx.f16 s16, s7 +; CHECK-NEXT: vmovx.f16 s15, s7 +; CHECK-NEXT: vins.f16 s13, s14 ; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vins.f16 s14, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vins.f16 s1, s16 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vmovx.f16 s15, s2 +; CHECK-NEXT: vins.f16 s1, s15 +; CHECK-NEXT: vmovx.f16 s15, s5 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov.f32 s15, s9 -; CHECK-NEXT: vins.f16 s15, s20 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vins.f16 s10, s20 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s17, s15 ; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vmovx.f16 s1, s10 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vins.f16 s15, s1 +; CHECK-NEXT: vmovx.f16 s1, s11 +; CHECK-NEXT: vins.f16 s10, s1 +; CHECK-NEXT: vmovx.f16 s1, s3 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmovx.f16 s7, s9 ; CHECK-NEXT: vmov.f32 s23, s10 ; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s1, s5 ; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vins.f16 s7, s11 ; CHECK-NEXT: vmovnb.i32 q6, q4 +; CHECK-NEXT: vmov.f32 s19, s10 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmovnb.i32 q2, q0 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmovx.f16 
s20, s0 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s3, s9 -; CHECK-NEXT: vins.f16 s21, s5 -; CHECK-NEXT: vins.f16 s3, s11 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovnb.i32 q1, q5 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: vadd.i16 q0, q3, q5 +; CHECK-NEXT: vadd.i16 q0, q3, q0 ; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -416,53 +416,51 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) { ; CHECK-LABEL: shuffle4step_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vmovx.f16 s18, s9 -; CHECK-NEXT: vins.f16 s18, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s16, s11 +; CHECK-NEXT: vins.f16 s18, s16 ; CHECK-NEXT: vmovx.f16 s19, s13 -; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s16, s15 ; CHECK-NEXT: vmovx.f16 s20, s3 +; CHECK-NEXT: vins.f16 s19, s16 ; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vins.f16 s13, s15 ; CHECK-NEXT: vins.f16 s16, s20 -; CHECK-NEXT: vmovx.f16 s20, s7 ; CHECK-NEXT: vmovx.f16 s17, s5 +; CHECK-NEXT: vmovx.f16 s20, s7 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vmov.f32 s20, s1 +; CHECK-NEXT: vmovx.f16 s1, s10 ; CHECK-NEXT: vmov.f32 s22, s9 ; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmovx.f16 s24, s10 ; CHECK-NEXT: vmov.f32 
s21, s5 ; CHECK-NEXT: vadd.i16 q4, q5, q4 ; CHECK-NEXT: vmovx.f16 s22, s8 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmovx.f16 s23, s12 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmov.f32 s10, s8 -; CHECK-NEXT: vmov.f32 s11, s12 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vins.f16 s23, s1 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vins.f16 s20, s1 ; CHECK-NEXT: vmovx.f16 s21, s4 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> @@ -769,12 +767,11 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) { ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q0[0] -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.8 q4[1], r0 @@ -797,19 +794,20 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) { ; CHECK-NEXT: vmov.u8 r0, q1[14] ; CHECK-NEXT: vmov.8 
q4[10], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[10] ; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.8 q5[15], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.f32 s18, s26 ; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[5] @@ -1028,8 +1026,8 @@ define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) { ; CHECK-LABEL: shuffle2_i64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -1146,8 +1144,8 @@ define arm_aapcs_vfpcc <4 x float> @shuffle2step_f32(<8 x float> %src) { ; CHECKFP-NEXT: vmov.f32 s9, s3 ; CHECKFP-NEXT: vmov.f32 s1, s2 ; CHECKFP-NEXT: vmov.f32 s10, s5 -; CHECKFP-NEXT: vmov.f32 s2, s4 ; CHECKFP-NEXT: vmov.f32 s11, s7 +; CHECKFP-NEXT: vmov.f32 s2, s4 ; CHECKFP-NEXT: vmov.f32 s3, s6 ; CHECKFP-NEXT: vadd.f32 q0, q0, q2 ; CHECKFP-NEXT: bx lr @@ -1163,17 +1161,17 @@ define arm_aapcs_vfpcc <4 x float> @shuffle3step_f32(<16 x float> %src) { ; CHECKFP: @ %bb.0: @ %entry ; CHECKFP-NEXT: .vsave {d8, d9} ; CHECKFP-NEXT: vpush {d8, d9} -; CHECKFP-NEXT: vmov.f32 s12, s1 -; CHECKFP-NEXT: vmov.f32 s16, s0 ; CHECKFP-NEXT: vmov.f32 s13, s4 -; CHECKFP-NEXT: vmov.f32 s17, s3 ; CHECKFP-NEXT: vmov.f32 s14, s7 ; CHECKFP-NEXT: vmov.f32 s18, s6 -; CHECKFP-NEXT: vmov.f32 s4, s2 -; CHECKFP-NEXT: vmov.f32 s6, s8 +; CHECKFP-NEXT: vmov.f32 s12, s1 ; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vmov.f32 s16, s0 +; CHECKFP-NEXT: vmov.f32 s17, s3 ; 
CHECKFP-NEXT: vmov.f32 s19, s9 ; CHECKFP-NEXT: vadd.f32 q3, q4, q3 +; CHECKFP-NEXT: vmov.f32 s4, s2 +; CHECKFP-NEXT: vmov.f32 s6, s8 ; CHECKFP-NEXT: vmov.f32 s7, s11 ; CHECKFP-NEXT: vadd.f32 q0, q3, q1 ; CHECKFP-NEXT: vpop {d8, d9} @@ -1195,18 +1193,18 @@ define arm_aapcs_vfpcc <4 x float> @shuffle4step_f32(<16 x float> %src) { ; CHECKFP-NEXT: vmov.f32 s16, s3 ; CHECKFP-NEXT: vmov.f32 s20, s2 ; CHECKFP-NEXT: vmov.f32 s17, s7 -; CHECKFP-NEXT: vmov.f32 s21, s6 ; CHECKFP-NEXT: vmov.f32 s18, s11 -; CHECKFP-NEXT: vmov.f32 s22, s10 ; CHECKFP-NEXT: vmov.f32 s19, s15 +; CHECKFP-NEXT: vmov.f32 s21, s6 +; CHECKFP-NEXT: vmov.f32 s22, s10 ; CHECKFP-NEXT: vmov.f32 s23, s14 ; CHECKFP-NEXT: vadd.f32 q4, q5, q4 ; CHECKFP-NEXT: vmov.f32 s20, s1 ; CHECKFP-NEXT: vmov.f32 s21, s5 -; CHECKFP-NEXT: vmov.f32 s1, s4 ; CHECKFP-NEXT: vmov.f32 s22, s9 -; CHECKFP-NEXT: vmov.f32 s2, s8 ; CHECKFP-NEXT: vmov.f32 s23, s13 +; CHECKFP-NEXT: vmov.f32 s1, s4 +; CHECKFP-NEXT: vmov.f32 s2, s8 ; CHECKFP-NEXT: vmov.f32 s3, s12 ; CHECKFP-NEXT: vadd.f32 q0, q0, q5 ; CHECKFP-NEXT: vadd.f32 q0, q0, q4 @@ -1230,12 +1228,12 @@ define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -1256,14 +1254,14 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { ; CHECK-LABEL: shuffle3_f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: 
vins.f16 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmovx.f16 s1, s7 ; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vins.f16 s1, s7 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> @@ -1340,24 +1338,24 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) { ; CHECKFP-LABEL: shuffle2step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: vmovx.f16 s12, s1 ; CHECKFP-NEXT: vmovx.f16 s8, s0 -; CHECKFP-NEXT: vins.f16 s8, s12 -; CHECKFP-NEXT: vmovx.f16 s12, s3 +; CHECKFP-NEXT: vmovx.f16 s10, s1 +; CHECKFP-NEXT: vins.f16 s8, s10 ; CHECKFP-NEXT: vmovx.f16 s9, s2 -; CHECKFP-NEXT: vins.f16 s0, s1 -; CHECKFP-NEXT: vins.f16 s9, s12 -; CHECKFP-NEXT: vins.f16 s2, s3 +; CHECKFP-NEXT: vmovx.f16 s10, s3 ; CHECKFP-NEXT: vmovx.f16 s12, s5 +; CHECKFP-NEXT: vins.f16 s9, s10 ; CHECKFP-NEXT: vmovx.f16 s10, s4 ; CHECKFP-NEXT: vins.f16 s10, s12 -; CHECKFP-NEXT: vins.f16 s4, s5 -; CHECKFP-NEXT: vmov.f32 s1, s2 -; CHECKFP-NEXT: vmovx.f16 s12, s7 ; CHECKFP-NEXT: vmovx.f16 s11, s6 +; CHECKFP-NEXT: vmovx.f16 s12, s7 +; CHECKFP-NEXT: vins.f16 s2, s3 ; CHECKFP-NEXT: vins.f16 s6, s7 -; CHECKFP-NEXT: vmov.f32 s2, s4 +; CHECKFP-NEXT: vins.f16 s4, s5 +; CHECKFP-NEXT: vins.f16 s0, s1 +; CHECKFP-NEXT: vmov.f32 s1, s2 ; CHECKFP-NEXT: vins.f16 s11, s12 +; CHECKFP-NEXT: vmov.f32 s2, s4 ; CHECKFP-NEXT: vmov.f32 s3, s6 ; CHECKFP-NEXT: vadd.f16 q0, q0, q2 ; CHECKFP-NEXT: bx lr @@ -1371,45 +1369,43 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) { ; CHECKFP-LABEL: shuffle3step_f16: ; CHECKFP: @ %bb.0: @ %entry -; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECKFP-NEXT: vmovx.f16 s16, s2 +; CHECKFP-NEXT: .vsave {d8, d9, d10} +; CHECKFP-NEXT: vpush {d8, d9, d10} ; CHECKFP-NEXT: vmov.f32 s12, s1 -; CHECKFP-NEXT: vins.f16 s12, s16 -; CHECKFP-NEXT: vmovx.f16 s16, s5 +; CHECKFP-NEXT: vmovx.f16 s14, s2 +; CHECKFP-NEXT: vins.f16 
s12, s14 ; CHECKFP-NEXT: vmov.f32 s13, s4 -; CHECKFP-NEXT: vmovx.f16 s20, s11 -; CHECKFP-NEXT: vins.f16 s13, s16 -; CHECKFP-NEXT: vmov.f32 s19, s10 -; CHECKFP-NEXT: vins.f16 s19, s20 +; CHECKFP-NEXT: vmovx.f16 s14, s5 +; CHECKFP-NEXT: vmov.f32 s15, s10 +; CHECKFP-NEXT: vins.f16 s13, s14 +; CHECKFP-NEXT: vmovx.f16 s14, s11 +; CHECKFP-NEXT: vins.f16 s15, s14 ; CHECKFP-NEXT: vmov.f32 s14, s7 -; CHECKFP-NEXT: vmovx.f16 s20, s8 -; CHECKFP-NEXT: vmov.f32 s28, s6 -; CHECKFP-NEXT: vins.f16 s14, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s7 -; CHECKFP-NEXT: vins.f16 s28, s20 -; CHECKFP-NEXT: vmovx.f16 s24, s1 -; CHECKFP-NEXT: vmovx.f16 s20, s0 -; CHECKFP-NEXT: vins.f16 s0, s24 -; CHECKFP-NEXT: vins.f16 s20, s2 -; CHECKFP-NEXT: vmovx.f16 s26, s4 -; CHECKFP-NEXT: vmovx.f16 s21, s3 -; CHECKFP-NEXT: vins.f16 s3, s26 -; CHECKFP-NEXT: vins.f16 s21, s5 -; CHECKFP-NEXT: vmovx.f16 s30, s10 -; CHECKFP-NEXT: vmovx.f16 s23, s9 -; CHECKFP-NEXT: vmov.f32 s18, s8 +; CHECKFP-NEXT: vmovx.f16 s16, s8 +; CHECKFP-NEXT: vmovx.f16 s4, s4 +; CHECKFP-NEXT: vmovx.f16 s7, s7 +; CHECKFP-NEXT: vmov.f32 s20, s6 +; CHECKFP-NEXT: vmovx.f16 s10, s10 +; CHECKFP-NEXT: vmovx.f16 s17, s3 +; CHECKFP-NEXT: vmovx.f16 s19, s9 +; CHECKFP-NEXT: vmovx.f16 s18, s6 +; CHECKFP-NEXT: vins.f16 s14, s16 +; CHECKFP-NEXT: vmovx.f16 s16, s0 +; CHECKFP-NEXT: vmovx.f16 s1, s1 +; CHECKFP-NEXT: vins.f16 s20, s7 +; CHECKFP-NEXT: vins.f16 s3, s4 +; CHECKFP-NEXT: vins.f16 s9, s10 +; CHECKFP-NEXT: vins.f16 s0, s1 +; CHECKFP-NEXT: vins.f16 s16, s2 ; CHECKFP-NEXT: vmov.f32 s1, s3 -; CHECKFP-NEXT: vins.f16 s9, s30 -; CHECKFP-NEXT: vins.f16 s23, s11 -; CHECKFP-NEXT: vmov.f32 s2, s28 -; CHECKFP-NEXT: vmovx.f16 s22, s6 +; CHECKFP-NEXT: vins.f16 s17, s5 +; CHECKFP-NEXT: vins.f16 s19, s11 +; CHECKFP-NEXT: vins.f16 s18, s8 +; CHECKFP-NEXT: vmov.f32 s2, s20 ; CHECKFP-NEXT: vmov.f32 s3, s9 -; CHECKFP-NEXT: vins.f16 s22, s8 -; CHECKFP-NEXT: vmov.f32 s15, s19 -; CHECKFP-NEXT: vadd.f16 q0, q0, q5 +; CHECKFP-NEXT: vadd.f16 q0, q0, q4 ; CHECKFP-NEXT: 
vadd.f16 q0, q0, q3 -; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECKFP-NEXT: vpop {d8, d9, d10} ; CHECKFP-NEXT: bx lr entry: %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> @@ -1425,47 +1421,47 @@ define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) { ; CHECKFP: @ %bb.0: @ %entry ; CHECKFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECKFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECKFP-NEXT: vmovx.f16 s20, s11 ; CHECKFP-NEXT: vmovx.f16 s18, s9 -; CHECKFP-NEXT: vins.f16 s18, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s15 +; CHECKFP-NEXT: vmovx.f16 s16, s11 +; CHECKFP-NEXT: vins.f16 s18, s16 ; CHECKFP-NEXT: vmovx.f16 s19, s13 -; CHECKFP-NEXT: vins.f16 s9, s11 -; CHECKFP-NEXT: vins.f16 s19, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s3 +; CHECKFP-NEXT: vmovx.f16 s16, s15 +; CHECKFP-NEXT: vmovx.f16 s22, s8 +; CHECKFP-NEXT: vins.f16 s19, s16 ; CHECKFP-NEXT: vmovx.f16 s16, s1 -; CHECKFP-NEXT: vmovx.f16 s24, s10 +; CHECKFP-NEXT: vmovx.f16 s20, s3 +; CHECKFP-NEXT: vins.f16 s1, s3 +; CHECKFP-NEXT: vmovx.f16 s3, s10 ; CHECKFP-NEXT: vins.f16 s16, s20 -; CHECKFP-NEXT: vmovx.f16 s20, s7 ; CHECKFP-NEXT: vmovx.f16 s17, s5 -; CHECKFP-NEXT: vins.f16 s13, s15 -; CHECKFP-NEXT: vins.f16 s17, s20 -; CHECKFP-NEXT: vmovx.f16 s22, s8 -; CHECKFP-NEXT: vins.f16 s22, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s14 +; CHECKFP-NEXT: vmovx.f16 s20, s7 +; CHECKFP-NEXT: vins.f16 s22, s3 ; CHECKFP-NEXT: vmovx.f16 s23, s12 -; CHECKFP-NEXT: vins.f16 s1, s3 -; CHECKFP-NEXT: vins.f16 s23, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s2 +; CHECKFP-NEXT: vmovx.f16 s3, s14 +; CHECKFP-NEXT: vins.f16 s17, s20 +; CHECKFP-NEXT: vins.f16 s23, s3 ; CHECKFP-NEXT: vmovx.f16 s20, s0 +; CHECKFP-NEXT: vmovx.f16 s3, s2 +; CHECKFP-NEXT: vins.f16 s9, s11 +; CHECKFP-NEXT: vins.f16 s13, s15 ; CHECKFP-NEXT: vins.f16 s5, s7 -; CHECKFP-NEXT: vins.f16 s20, s24 -; CHECKFP-NEXT: vmovx.f16 s24, s6 +; CHECKFP-NEXT: vins.f16 s20, s3 ; CHECKFP-NEXT: vmovx.f16 s21, s4 +; CHECKFP-NEXT: vmovx.f16 s3, 
s6 ; CHECKFP-NEXT: vins.f16 s8, s10 -; CHECKFP-NEXT: vins.f16 s21, s24 -; CHECKFP-NEXT: vmov.f32 s26, s9 ; CHECKFP-NEXT: vins.f16 s12, s14 -; CHECKFP-NEXT: vins.f16 s0, s2 -; CHECKFP-NEXT: vmov.f32 s27, s13 ; CHECKFP-NEXT: vins.f16 s4, s6 +; CHECKFP-NEXT: vins.f16 s21, s3 +; CHECKFP-NEXT: vins.f16 s0, s2 ; CHECKFP-NEXT: vmov.f32 s24, s1 +; CHECKFP-NEXT: vmov.f32 s26, s9 +; CHECKFP-NEXT: vmov.f32 s27, s13 +; CHECKFP-NEXT: vmov.f32 s25, s5 ; CHECKFP-NEXT: vmov.f32 s2, s8 +; CHECKFP-NEXT: vadd.f16 q4, q6, q4 ; CHECKFP-NEXT: vmov.f32 s3, s12 ; CHECKFP-NEXT: vmov.f32 s1, s4 -; CHECKFP-NEXT: vmov.f32 s25, s5 ; CHECKFP-NEXT: vadd.f16 q0, q0, q5 -; CHECKFP-NEXT: vadd.f16 q4, q6, q4 ; CHECKFP-NEXT: vadd.f16 q0, q0, q4 ; CHECKFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECKFP-NEXT: bx lr @@ -1495,8 +1491,8 @@ define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) { ; CHECK-LABEL: shuffle2_f64: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -1559,7 +1555,6 @@ entry: define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) { ; CHECK-LABEL: insert_f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: bx lr entry: %res = insertelement <4 x float> undef, float %a, i32 0 @@ -1569,7 +1564,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @insert_f16(half %a) { ; CHECK-LABEL: insert_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: bx lr entry: %res = insertelement <8 x half> undef, half %a, i32 0 @@ -1579,7 +1573,6 @@ entry: define arm_aapcs_vfpcc <2 x double> @insert_f64(double %a) { ; CHECK-LABEL: insert_f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: bx lr entry: %res = insertelement <2 x double> undef, double %a, i32 0 @@ -1696,7 +1689,6 @@ entry: define 
arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) { ; CHECK-LABEL: extract_f32_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <4 x float> %a, i32 0 @@ -1716,7 +1708,6 @@ entry: define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) { ; CHECK-LABEL: extract_f16_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <8 x half> %a, i32 0 @@ -1736,7 +1727,6 @@ entry: define arm_aapcs_vfpcc double @extract_f64_0(<2 x double> %a) { ; CHECK-LABEL: extract_f64_0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bx lr entry: %res = extractelement <2 x double> %a, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll index 043f7d9576a3b..b487407eefa5a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll @@ -52,10 +52,10 @@ define arm_aapcs_vfpcc <4 x i32> @sext_i32_1357_swapped(<8 x i16> %src) { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: add sp, #16 @@ -94,9 +94,9 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_02468101214_swapped(<16 x i16> %src) ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: @@ -126,17 +126,17 @@ define arm_aapcs_vfpcc <8 x i32> @sext_i32_13579111315_swapped(<16 x i16> %src) ; CHECK-NEXT: 
add r1, sp, #16 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vldrh.s32 q2, [r0] +; CHECK-NEXT: vldrh.s32 q0, [r0] ; CHECK-NEXT: vldrh.s32 q1, [r0, #8] -; CHECK-NEXT: vldrh.s32 q3, [r1] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 ; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vldrh.s32 q1, [r1] ; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr @@ -195,10 +195,10 @@ define arm_aapcs_vfpcc <4 x i32> @zext_i32_1357_swapped(<8 x i16> %src) { ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: add sp, #16 @@ -237,9 +237,9 @@ define arm_aapcs_vfpcc <8 x i32> @zext_i32_02468101214_swapped(<16 x i16> %src) ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr entry: @@ -269,17 +269,17 @@ define arm_aapcs_vfpcc <8 x i32> @zext_i32_13579111315_swapped(<16 x i16> %src) ; CHECK-NEXT: add r1, sp, #16 ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vldrh.u32 q2, [r0] +; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: vldrh.u32 q1, [r0, #8] -; CHECK-NEXT: vldrh.u32 q3, [r1] -; CHECK-NEXT: vmov.f32 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s11 ; CHECK-NEXT: vldrh.u32 q2, 
[r1, #8] +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s5 ; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vldrh.u32 q1, [r1] ; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll index ce08e69b6816c..7318ec8077deb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -37,12 +37,12 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: @@ -340,12 +340,12 @@ define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x hal ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vins.f16 s1, s6 ; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s0, s7 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vins.f16 s2, s5 ; CHECK-NEXT: vins.f16 s3, s4 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll index a71adb8a655d1..d145b6a61737b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -56,11 +56,10 @@ entry: define arm_aapcs_vfpcc <4 x float> 
@add_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: add_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vadd.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vadd.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vadd.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vadd.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vadd.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vadd.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vadd.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vadd.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: add_float32_t: @@ -75,27 +74,26 @@ entry: define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: add_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vadd.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vadd.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vadd.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vadd.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vadd.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vadd.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vadd.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vadd.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vadd.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vadd.f16 s2, s6, s2 +; 
CHECK-MVE-NEXT: vadd.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vadd.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vadd.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: add_float16_t: @@ -189,11 +187,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @sub_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: sub_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vsub.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vsub.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vsub.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vsub.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vsub.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vsub.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vsub.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vsub.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: sub_float32_t: @@ -208,27 +205,26 @@ entry: define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: sub_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vsub.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vsub.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vsub.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vsub.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vsub.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vsub.f16 s0, s4, s0 +; 
CHECK-MVE-NEXT: vsub.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vsub.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vsub.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vsub.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vsub.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vsub.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vsub.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: sub_float16_t: @@ -324,27 +320,26 @@ entry: define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) { ; CHECK-MVE-LABEL: mul_float16_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vmovx.f16 s2, s4 -; CHECK-MVE-NEXT: vmovx.f16 s0, s8 -; CHECK-MVE-NEXT: vmovx.f16 s14, s5 -; CHECK-MVE-NEXT: vmul.f16 s12, s2, s0 -; CHECK-MVE-NEXT: vmul.f16 s0, s4, s8 -; CHECK-MVE-NEXT: vins.f16 s0, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s9 -; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmul.f16 s1, s5, s9 -; CHECK-MVE-NEXT: vins.f16 s1, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s10 -; CHECK-MVE-NEXT: vmovx.f16 s14, s6 -; CHECK-MVE-NEXT: vmul.f16 s2, s6, s10 -; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmovx.f16 s14, s7 -; CHECK-MVE-NEXT: vins.f16 s2, s12 -; CHECK-MVE-NEXT: vmovx.f16 s12, s11 -; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12 -; CHECK-MVE-NEXT: vmul.f16 s3, s7, s11 -; CHECK-MVE-NEXT: vins.f16 s3, s12 +; CHECK-MVE-NEXT: vmovx.f16 s8, s0 +; CHECK-MVE-NEXT: vmovx.f16 s10, s4 +; CHECK-MVE-NEXT: vmul.f16 s0, s4, s0 +; CHECK-MVE-NEXT: vmul.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vins.f16 s0, s8 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s5 +; CHECK-MVE-NEXT: vmul.f16 s1, s5, s1 +; CHECK-MVE-NEXT: vmul.f16 s4, s8, 
s4 +; CHECK-MVE-NEXT: vmovx.f16 s8, s6 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vmul.f16 s2, s6, s2 +; CHECK-MVE-NEXT: vmul.f16 s4, s8, s4 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s7 +; CHECK-MVE-NEXT: vmul.f16 s3, s7, s3 +; CHECK-MVE-NEXT: vmul.f16 s4, s6, s4 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: mul_float16_t: @@ -359,11 +354,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @mul_float32_t(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: mul_float32_t: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmul.f32 s11, s7, s3 -; CHECK-MVE-NEXT: vmul.f32 s10, s6, s2 -; CHECK-MVE-NEXT: vmul.f32 s9, s5, s1 -; CHECK-MVE-NEXT: vmul.f32 s8, s4, s0 -; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: vmul.f32 s3, s7, s3 +; CHECK-MVE-NEXT: vmul.f32 s2, s6, s2 +; CHECK-MVE-NEXT: vmul.f32 s1, s5, s1 +; CHECK-MVE-NEXT: vmul.f32 s0, s4, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: mul_float32_t: diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll index 488a20bc9602f..4b76906034057 100644 --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -6,10 +6,10 @@ define <16 x i8> @vector_add_i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; CHECK-LE-LABEL: vector_add_i8: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i8 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -17,9 +17,9 @@ define <16 x i8> @vector_add_i8(<16 x i8> %lhs, <16 x i8> %rhs) { ; ; CHECK-BE-LABEL: vector_add_i8: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov 
d1, r3, r2 ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vldrb.u8 q0, [r0] ; CHECK-BE-NEXT: vadd.i8 q0, q1, q0 @@ -35,10 +35,10 @@ entry: define <8 x i16> @vector_add_i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; CHECK-LE-LABEL: vector_add_i16: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i16 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -46,9 +46,9 @@ define <8 x i16> @vector_add_i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; ; CHECK-BE-LABEL: vector_add_i16: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vldrh.u16 q0, [r0] ; CHECK-BE-NEXT: vadd.i16 q0, q1, q0 @@ -64,10 +64,10 @@ entry: define <4 x i32> @vector_add_i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LE-LABEL: vector_add_i32: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vmov d0, r0, r1 ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrw.u32 q1, [r0] +; CHECK-LE-NEXT: vmov d1, r2, r3 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1 ; CHECK-LE-NEXT: vmov r0, r1, d0 ; CHECK-LE-NEXT: vmov r2, r3, d1 @@ -75,9 +75,9 @@ define <4 x i32> @vector_add_i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; ; CHECK-BE-LABEL: vector_add_i32: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0] ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0 @@ -144,10 +144,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-MVE-NEXT: push {r4, r5, r7, lr} ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-MVE-NEXT: 
vmov d9, r2, r3 ; CHECK-MVE-NEXT: vmov d8, r0, r1 ; CHECK-MVE-NEXT: add r0, sp, #64 ; CHECK-MVE-NEXT: vldrw.u32 q6, [r0] +; CHECK-MVE-NEXT: vmov d9, r2, r3 ; CHECK-MVE-NEXT: vmov.u16 r4, q4[0] ; CHECK-MVE-NEXT: vmov.u16 r0, q6[0] ; CHECK-MVE-NEXT: bl __aeabi_h2f @@ -239,13 +239,13 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; CHECK-BE-NEXT: push {r4, r5, r7, lr} ; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: add r0, sp, #64 ; CHECK-BE-NEXT: vldrh.u16 q6, [r0] +; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q4, q0 -; CHECK-BE-NEXT: vmov.u16 r4, q4[0] ; CHECK-BE-NEXT: vmov.u16 r0, q6[0] +; CHECK-BE-NEXT: vmov.u16 r4, q4[0] ; CHECK-BE-NEXT: bl __aeabi_h2f ; CHECK-BE-NEXT: mov r5, r0 ; CHECK-BE-NEXT: mov r0, r4 @@ -332,10 +332,10 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) { ; ; CHECK-FP-LABEL: vector_add_f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vmov d0, r0, r1 ; CHECK-FP-NEXT: mov r0, sp ; CHECK-FP-NEXT: vldrw.u32 q1, [r0] +; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 ; CHECK-FP-NEXT: vmov r0, r1, d0 ; CHECK-FP-NEXT: vmov r2, r3, d1 @@ -352,21 +352,21 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-MVE-NEXT: .pad #4 ; CHECK-MVE-NEXT: sub sp, #4 -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: mov r4, r0 -; CHECK-MVE-NEXT: add r0, sp, #56 -; CHECK-MVE-NEXT: vldrw.u32 q5, [r0] +; CHECK-MVE-NEXT: add r0, sp, #40 +; CHECK-MVE-NEXT: vldrw.u32 q4, [r0] ; CHECK-MVE-NEXT: mov r6, r1 ; CHECK-MVE-NEXT: mov r0, r3 ; CHECK-MVE-NEXT: mov r5, r2 -; CHECK-MVE-NEXT: vmov r7, r1, d11 +; CHECK-MVE-NEXT: 
vmov r7, r1, d9 ; CHECK-MVE-NEXT: bl __aeabi_fadd ; CHECK-MVE-NEXT: vmov s19, r0 ; CHECK-MVE-NEXT: mov r0, r5 ; CHECK-MVE-NEXT: mov r1, r7 ; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov r5, r1, d10 +; CHECK-MVE-NEXT: vmov r5, r1, d8 ; CHECK-MVE-NEXT: vmov s18, r0 ; CHECK-MVE-NEXT: mov r0, r6 ; CHECK-MVE-NEXT: bl __aeabi_fadd @@ -377,7 +377,7 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-MVE-NEXT: vmov s16, r0 ; CHECK-MVE-NEXT: vmov r2, r3, d9 ; CHECK-MVE-NEXT: vmov r0, r1, d8 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: add sp, #4 ; CHECK-MVE-NEXT: pop {r4, r5, r6, r7, pc} ; @@ -385,23 +385,23 @@ define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .save {r4, r5, r7, lr} ; CHECK-BE-NEXT: push {r4, r5, r7, lr} -; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-BE-NEXT: vmov d1, r3, r2 +; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-BE-NEXT: vpush {d8, d9, d10, d11} ; CHECK-BE-NEXT: vmov d0, r1, r0 -; CHECK-BE-NEXT: add r1, sp, #64 -; CHECK-BE-NEXT: vldrw.u32 q6, [r1] -; CHECK-BE-NEXT: vrev64.32 q5, q0 -; CHECK-BE-NEXT: vmov r4, r0, d11 -; CHECK-BE-NEXT: vmov r5, r1, d13 +; CHECK-BE-NEXT: add r1, sp, #48 +; CHECK-BE-NEXT: vldrw.u32 q5, [r1] +; CHECK-BE-NEXT: vmov d1, r3, r2 +; CHECK-BE-NEXT: vrev64.32 q4, q0 +; CHECK-BE-NEXT: vmov r4, r0, d9 +; CHECK-BE-NEXT: vmov r5, r1, d11 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s19, r0 ; CHECK-BE-NEXT: mov r0, r4 ; CHECK-BE-NEXT: mov r1, r5 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s18, r0 -; CHECK-BE-NEXT: vmov r4, r0, d10 -; CHECK-BE-NEXT: vmov r5, r1, d12 +; CHECK-BE-NEXT: vmov r4, r0, d8 +; CHECK-BE-NEXT: vmov r5, r1, d10 ; CHECK-BE-NEXT: bl __aeabi_fadd ; CHECK-BE-NEXT: vmov s17, r0 ; CHECK-BE-NEXT: mov r0, r4 @@ -411,15 +411,15 @@ define <4 x float> 
@vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-BE-NEXT: vrev64.32 q0, q4 ; CHECK-BE-NEXT: vmov r1, r0, d0 ; CHECK-BE-NEXT: vmov r3, r2, d1 -; CHECK-BE-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-BE-NEXT: vpop {d8, d9, d10, d11} ; CHECK-BE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-FP-LABEL: vector_add_f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vmov d0, r0, r1 ; CHECK-FP-NEXT: mov r0, sp ; CHECK-FP-NEXT: vldrw.u32 q1, [r0] +; CHECK-FP-NEXT: vmov d1, r2, r3 ; CHECK-FP-NEXT: vadd.f32 q0, q0, q1 ; CHECK-FP-NEXT: vmov r0, r1, d0 ; CHECK-FP-NEXT: vmov r2, r3, d1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll index df14b59f9934d..56f95b2218378 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -186,8 +186,8 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 @@ -195,12 +195,13 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r5, s8 ; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s8, s10 ; CHECK-NEXT: vmov r7, s18 ; CHECK-NEXT: asrs r4, r3, #31 ; CHECK-NEXT: subs.w r8, r3, r5 @@ -209,24 +210,21 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture 
readonly % ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r5, #0, #4 ; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov r3, s6 ; CHECK-NEXT: subs.w r9, r5, r7 ; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 ; CHECK-NEXT: and.w r6, r12, r6, asr #31 ; CHECK-NEXT: rsbs r6, r6, #0 ; CHECK-NEXT: bfi r4, r6, #4, #4 -; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: asrs r7, r6, #31 ; CHECK-NEXT: subs.w r10, r6, r3 -; CHECK-NEXT: asr.w r7, r6, #31 +; CHECK-NEXT: asr.w r6, r5, #31 ; CHECK-NEXT: sbc.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: asrs r6, r5, #31 +; CHECK-NEXT: vmov r7, s8 ; CHECK-NEXT: asr.w r11, r3, #31 ; CHECK-NEXT: and.w r3, r12, r3, asr #31 ; CHECK-NEXT: rsbs r3, r3, #0 @@ -247,7 +245,7 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: @@ -390,7 +388,13 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: vmov.f32 s20, s12 ; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: vand q5, q5, q0 +; CHECK-NEXT: vmov.f32 s8, s10 ; CHECK-NEXT: vmov r5, r6, d10 +; CHECK-NEXT: vmov.f32 s10, s11 +; CHECK-NEXT: vmov.f32 s12, s14 +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov.f32 s14, s15 +; CHECK-NEXT: vand q3, q3, q0 ; CHECK-NEXT: subs.w r8, r5, r3 ; CHECK-NEXT: vmov r7, r3, d11 ; CHECK-NEXT: sbc.w r4, r6, r4 @@ -398,12 +402,6 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly % ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: 
bfi r4, r5, #0, #4 ; CHECK-NEXT: vmov r5, r6, d9 -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vand q2, q4, q0 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vand q3, q4, q0 ; CHECK-NEXT: subs.w r9, r7, r5 ; CHECK-NEXT: mov.w r7, #1 ; CHECK-NEXT: sbcs r3, r6 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll index a94079d659216..43a3d1b049acf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll @@ -759,8 +759,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -769,15 +769,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -785,76 +785,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 
-; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; 
CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: @@ -871,8 +870,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -884,16 +883,16 @@ define arm_aapcs_vfpcc <8 x half> 
@vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: vmovx.f16 s18, s12 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -901,11 +900,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -913,11 +911,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -925,14 +923,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; 
CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -940,11 +939,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -952,14 +951,14 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 
s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -967,10 +966,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -979,10 +978,9 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: @@ -1000,8 +998,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1010,15 +1008,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp 
r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1026,76 +1024,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, 
s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; 
CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: @@ -1112,8 +1109,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1122,15 +1119,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1138,76 +1135,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; 
CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: 
vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: @@ -1224,8 +1220,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1234,15 +1230,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1250,76 +1246,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: 
vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: @@ -1336,8 +1331,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; 
CHECK-MVE-LABEL: vcmp_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1346,15 +1341,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1362,76 +1357,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: 
vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; 
CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: @@ -1448,8 +1442,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1461,16 +1455,16 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: vmovx.f16 s18, s12 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; 
CHECK-MVE-NEXT: cmp r1, #0 @@ -1478,11 +1472,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1490,11 +1483,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1502,14 +1495,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: 
it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1517,11 +1511,11 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1529,14 +1523,14 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1544,10 +1538,10 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -1556,10 +1550,9 @@ define arm_aapcs_vfpcc <8 x half> 
@vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: @@ -1577,8 +1570,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1587,15 +1580,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1603,76 +1596,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, 
s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, 
s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: @@ -1689,8 +1681,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1699,15 +1691,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi 
r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1715,76 +1707,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: 
vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; 
CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: @@ -1801,8 +1792,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1811,15 +1802,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1827,76 +1818,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: 
vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, 
s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: @@ -1913,8 +1903,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -1923,15 +1913,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 
+; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -1939,76 +1929,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; 
CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 
s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: @@ -2025,8 +2014,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -2035,15 +2024,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -2051,76 +2040,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; 
CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: 
vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: @@ -2137,8 +2125,8 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -2147,15 +2135,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, 
s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -2163,76 +2151,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; 
CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: @@ -2250,8 +2237,8 @@ entry: define arm_aapcs_vfpcc <8 x half> 
@vcmp_uno_v8f16(<8 x half> %src, <8 x half> %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11} +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s16, s4 ; CHECK-MVE-NEXT: vmovx.f16 s18, s0 ; CHECK-MVE-NEXT: vcmp.f16 s18, s16 @@ -2260,15 +2247,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s8 +; CHECK-MVE-NEXT: vcmp.f16 s0, s4 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vmovx.f16 s16, s8 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s0, s4 -; CHECK-MVE-NEXT: vseleq.f16 s20, s18, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s12 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s1 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 @@ -2276,76 +2263,75 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %s ; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: vins.f16 s16, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s5 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s1 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s9 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s13 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s1, s5 ; 
CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s2 +; CHECK-MVE-NEXT: vmovx.f16 s8, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s6 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s0, s16 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 +; CHECK-MVE-NEXT: vcmp.f16 s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s10 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s14 -; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s2, s6 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s22, s3 +; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s7 -; CHECK-MVE-NEXT: vcmp.f16 s22, s20 -; CHECK-MVE-NEXT: vmovx.f16 s20, s11 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 +; CHECK-MVE-NEXT: vcmp.f16 s6, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s11 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vmovx.f16 s22, s15 +; CHECK-MVE-NEXT: vmovx.f16 s6, s15 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s7 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s20, s22, s20 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s20 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s4 +; CHECK-MVE-NEXT: vpop {d8, d9} ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll index e47207594a7ac..d90688d43d6e0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll @@ -801,8 +801,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -822,12 +820,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; 
CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -839,17 +837,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -861,17 +859,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -882,17 +880,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; 
CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: @@ -912,8 +908,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -937,12 +931,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -956,19 +950,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s1, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it gt 
; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -982,19 +976,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s2, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1007,7 +1001,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -1016,10 +1010,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop 
{d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: @@ -1040,8 +1032,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1061,12 +1051,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1078,17 +1068,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, 
s10 @@ -1100,17 +1090,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1121,17 +1111,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: @@ -1151,8 +1139,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1172,12 +1158,12 @@ define 
arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1189,17 +1175,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1211,17 +1197,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: 
mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1232,17 +1218,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: @@ -1262,8 +1246,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1283,12 +1265,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: 
vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1300,17 +1282,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1322,17 +1304,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1343,17 +1325,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; 
CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: @@ -1373,8 +1353,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1394,12 +1372,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1411,17 +1389,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; 
CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1433,17 +1411,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1454,17 +1432,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: @@ -1484,8 +1460,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half 
%src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1509,12 +1483,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1528,19 +1502,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s1, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1554,19 +1528,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s2, s4 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: 
mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1579,7 +1553,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -1588,10 +1562,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: @@ -1612,8 +1584,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1633,12 +1603,12 @@ define arm_aapcs_vfpcc <8 x half> 
@vcmp_une_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1650,17 +1620,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1672,17 +1642,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1693,17 +1663,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: @@ -1723,8 +1691,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1744,12 +1710,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; 
CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1761,17 +1727,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1783,17 +1749,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1804,17 +1770,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: @@ -1834,8 +1798,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1855,12 +1817,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1872,17 +1834,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp 
r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -1894,17 +1856,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -1915,17 +1877,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: @@ -1945,8 +1905,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <8 x half> %a, 
<8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -1966,12 +1924,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -1983,17 +1941,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2005,17 +1963,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, 
s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2026,17 +1984,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: @@ -2056,8 +2012,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -2077,12 +2031,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, 
#0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -2094,17 +2048,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2116,17 +2070,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2137,17 
+2091,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: @@ -2167,8 +2119,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -2188,12 +2138,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -2205,17 +2155,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; 
CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2227,17 +2177,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2248,17 +2198,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; 
CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: @@ -2279,8 +2227,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -2300,12 +2246,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -2317,17 +2263,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; 
CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -2339,17 +2285,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -2360,17 +2306,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, < ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: @@ -3190,8 +3134,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3211,12 
+3153,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3228,17 +3170,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3250,17 +3192,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 
; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3271,17 +3213,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16: @@ -3301,8 +3241,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3326,12 +3264,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 
s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3345,19 +3283,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3371,19 +3309,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3396,7 +3334,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: 
vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -3405,10 +3343,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16: @@ -3429,8 +3365,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3450,12 +3384,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3467,17 +3401,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, 
s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3489,17 +3423,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3510,17 +3444,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; 
CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16: @@ -3540,8 +3472,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3561,12 +3491,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3578,17 +3508,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ 
-3600,17 +3530,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3621,17 +3551,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16: @@ -3651,8 +3579,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3672,12 +3598,12 @@ define arm_aapcs_vfpcc 
<8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3689,17 +3615,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3711,17 +3637,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3732,17 +3658,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16: @@ -3762,8 +3686,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3783,12 +3705,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 
s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3800,17 +3722,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3822,17 +3744,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3843,17 +3765,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16: @@ -3873,8 +3793,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -3898,12 +3816,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -3917,19 +3835,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: 
cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -3943,19 +3861,19 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s2 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -3968,7 +3886,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -3977,10 +3895,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, 
s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16: @@ -4001,8 +3917,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4022,12 +3936,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4039,17 +3953,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4061,17 +3975,17 @@ define 
arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4082,17 +3996,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16: @@ -4112,8 +4024,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4133,12 +4043,12 @@ define arm_aapcs_vfpcc <8 x half> 
@vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4150,17 +4060,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4172,17 +4082,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4193,17 +4103,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16: @@ -4223,8 +4131,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4244,12 +4150,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 
s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4261,17 +4167,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4283,17 +4189,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4304,17 +4210,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16: @@ -4334,8 +4238,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4355,12 +4257,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4372,17 +4274,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: 
cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4394,17 +4296,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4415,17 +4317,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: @@ -4445,8 +4345,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, <8 x 
half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4466,12 +4364,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4483,17 +4381,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4505,17 +4403,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: 
vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4526,17 +4424,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: @@ -4556,8 +4452,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4577,12 +4471,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; 
CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4594,17 +4488,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4616,17 +4510,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: 
vmovx.f16 s6, s11 @@ -4637,17 +4531,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: @@ -4668,8 +4560,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 @@ -4689,12 +4579,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4706,17 +4596,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4728,17 +4618,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s4, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4749,17 +4639,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2, ; CHECK-MVE-NEXT: vcmp.f16 s4, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 
-; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: @@ -4782,8 +4670,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oeq_v8f16_bc: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s6, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 @@ -4803,12 +4689,12 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2 ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s13 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s12, s8 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s0, s12, s8 +; CHECK-MVE-NEXT: vmovx.f16 s8, s13 +; CHECK-MVE-NEXT: vins.f16 s0, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s1 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s9 @@ -4820,17 +4706,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s14 +; CHECK-MVE-NEXT: vmovx.f16 s8, s14 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s17, s13, s9 -; CHECK-MVE-NEXT: vins.f16 s17, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s13, s9 +; CHECK-MVE-NEXT: vins.f16 s1, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; 
CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s10 @@ -4842,17 +4728,17 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s5, s15 +; CHECK-MVE-NEXT: vmovx.f16 s8, s15 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s18, s14, s10 -; CHECK-MVE-NEXT: vins.f16 s18, s6 +; CHECK-MVE-NEXT: vseleq.f16 s2, s14, s10 +; CHECK-MVE-NEXT: vins.f16 s2, s6 ; CHECK-MVE-NEXT: vmovx.f16 s6, s3 ; CHECK-MVE-NEXT: vcmp.f16 s6, s4 ; CHECK-MVE-NEXT: vmovx.f16 s6, s11 @@ -4863,17 +4749,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2 ; CHECK-MVE-NEXT: vcmp.f16 s3, s4 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s6, s5, s6 +; CHECK-MVE-NEXT: vseleq.f16 s6, s8, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s19, s15, s11 -; CHECK-MVE-NEXT: vins.f16 s19, s6 -; CHECK-MVE-NEXT: vmov q0, q4 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s15, s11 +; CHECK-MVE-NEXT: vins.f16 s3, s6 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16_bc: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll index eee5c5b249e1a..33231783c5e69 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll @@ -759,8 +759,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; 
CHECK-MVE-LABEL: vcmp_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -774,43 +772,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: 
vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -819,20 +817,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -840,17 +838,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: 
vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: @@ -867,8 +863,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -884,7 +878,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -892,15 +886,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -908,25 +900,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 
s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -937,22 +931,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs 
APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -962,7 +956,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -971,10 +965,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: @@ -992,8 +984,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1007,43 +997,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; 
CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -1052,20 +1042,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; 
CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -1073,17 +1063,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: @@ -1100,8 +1088,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1115,43 +1101,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; 
CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -1160,20 +1146,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, 
s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -1181,17 +1167,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: @@ -1208,8 +1192,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1223,43 +1205,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: 
vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: 
movmi r1, #1 @@ -1268,20 +1250,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -1289,17 +1271,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: @@ -1316,8 +1296,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; 
CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1331,43 +1309,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; 
CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -1376,20 +1354,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -1397,17 +1375,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, 
s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: @@ -1424,8 +1400,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1441,7 +1415,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1449,15 +1423,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1465,25 +1437,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -1494,22 +1468,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; 
CHECK-MVE-NEXT: moveq r1, #1 @@ -1519,7 +1493,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -1528,10 +1502,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: @@ -1549,8 +1521,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1564,43 +1534,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; 
CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -1609,20 +1579,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; 
CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -1630,17 +1600,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: @@ -1657,8 +1625,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1672,43 +1638,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: 
vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -1717,20 +1683,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -1738,17 +1704,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: @@ -1765,8 +1729,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1780,43 +1742,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; 
CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -1825,20 +1787,20 @@ define 
arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -1846,17 +1808,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: @@ -1873,8 +1833,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; 
CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1888,43 +1846,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; 
CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -1933,20 +1891,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -1954,17 +1912,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; 
CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: @@ -1981,8 +1937,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -1996,43 +1950,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; 
CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -2041,20 +1995,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -2062,17 +2016,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp 
r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: @@ -2089,8 +2041,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -2104,43 +2054,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: 
vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -2149,20 +2099,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -2170,17 +2120,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> 
%a ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: @@ -2198,8 +2146,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -2213,43 +2159,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -2258,20 +2204,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; 
CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -2279,17 +2225,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: @@ -3064,8 +3008,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3079,43 +3021,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; 
CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3124,20 +3066,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3145,17 +3087,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16: @@ -3172,8 +3112,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_one_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3189,7 +3127,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, 
#1 @@ -3197,15 +3135,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3213,25 +3149,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; 
CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3242,22 +3180,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3267,7 +3205,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 @@ -3276,10 +3214,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr 
; ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16: @@ -3297,8 +3233,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3312,43 +3246,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: 
cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3357,20 +3291,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r1, #1 @@ -3378,17 +3312,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it mi ; CHECK-MVE-NEXT: movmi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; 
CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16: @@ -3405,8 +3337,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_oge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3420,43 +3350,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 
s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -3465,20 +3395,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r1, #1 @@ -3486,17 +3416,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> ; 
CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ls ; CHECK-MVE-NEXT: movls r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16: @@ -3513,8 +3441,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_olt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3528,43 +3454,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -3573,20 +3499,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; 
CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r1, #1 @@ -3594,17 +3520,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it gt ; CHECK-MVE-NEXT: movgt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16: @@ -3621,8 +3545,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ole_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3636,43 +3558,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; 
CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -3681,20 +3603,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r1, #1 @@ -3702,17 +3624,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ge ; CHECK-MVE-NEXT: movge r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16: @@ -3729,8 +3649,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3746,7 +3664,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, 
#1 @@ -3754,15 +3672,13 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3770,25 +3686,27 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; 
CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3799,22 +3717,22 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s2, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r1, #1 @@ -3824,7 +3742,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it eq ; CHECK-MVE-NEXT: moveq r0, #1 @@ -3833,10 +3751,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr 
; ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16: @@ -3854,8 +3770,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_une_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3869,43 +3783,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: 
cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -3914,20 +3828,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r1, #1 @@ -3935,17 +3849,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it ne ; CHECK-MVE-NEXT: movne r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; 
CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16: @@ -3962,8 +3874,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -3977,43 +3887,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 
s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -4022,20 +3932,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r1, #1 @@ -4043,17 +3953,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> ; 
CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it lt ; CHECK-MVE-NEXT: movlt r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16: @@ -4070,8 +3978,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_uge_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -4085,43 +3991,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -4130,20 +4036,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; 
CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r1, #1 @@ -4151,17 +4057,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it le ; CHECK-MVE-NEXT: movle r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16: @@ -4178,8 +4082,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ult_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -4193,43 +4095,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; 
CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -4238,20 +4140,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; 
CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r1, #1 @@ -4259,17 +4161,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it hi ; CHECK-MVE-NEXT: movhi r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: @@ -4286,8 +4186,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ule_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, #0 @@ -4301,43 +4199,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, 
#1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -4346,20 +4244,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; 
CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, #0 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, #0 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r1, #1 @@ -4367,17 +4265,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, #0 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it pl ; CHECK-MVE-NEXT: movpl r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: @@ -4394,8 +4290,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: vcmp_r_ord_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -4409,43 +4303,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x 
half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr 
; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -4454,20 +4348,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r1, #1 @@ -4475,17 +4369,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vc ; CHECK-MVE-NEXT: movvc r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: @@ -4503,8 +4395,6 @@ entry: define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { ; CHECK-MVE-LABEL: 
vcmp_r_uno_v8f16: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .vsave {d8, d9} -; CHECK-MVE-NEXT: vpush {d8, d9} ; CHECK-MVE-NEXT: vmovx.f16 s12, s0 ; CHECK-MVE-NEXT: movs r1, #0 ; CHECK-MVE-NEXT: vcmp.f16 s12, s12 @@ -4518,43 +4408,43 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s0, s0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s14, s12 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: movs r0, #0 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 -; CHECK-MVE-NEXT: movs r0, #0 -; CHECK-MVE-NEXT: vins.f16 s12, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s1 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: vseleq.f16 s0, s8, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s1 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s5 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vmovx.f16 s8, s9 ; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmovx.f16 s8, s10 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s13, s9, s5 -; CHECK-MVE-NEXT: vins.f16 s13, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s2 -; CHECK-MVE-NEXT: vcmp.f16 s16, 
s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: vseleq.f16 s1, s9, s5 +; CHECK-MVE-NEXT: vins.f16 s0, s12 +; CHECK-MVE-NEXT: vins.f16 s1, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s2 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s6 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -4563,20 +4453,20 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s8, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: cset r1, ne -; CHECK-MVE-NEXT: vmovx.f16 s18, s11 ; CHECK-MVE-NEXT: cmp r1, #0 ; CHECK-MVE-NEXT: mov.w r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s14, s10, s6 -; CHECK-MVE-NEXT: vins.f16 s14, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s3 -; CHECK-MVE-NEXT: vcmp.f16 s16, s16 -; CHECK-MVE-NEXT: vmovx.f16 s16, s7 +; CHECK-MVE-NEXT: vseleq.f16 s2, s10, s6 +; CHECK-MVE-NEXT: vmovx.f16 s6, s11 +; CHECK-MVE-NEXT: vins.f16 s2, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s3 +; CHECK-MVE-NEXT: vcmp.f16 s4, s4 +; CHECK-MVE-NEXT: vmovx.f16 s4, s7 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r1, #1 @@ -4584,17 +4474,15 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> ; CHECK-MVE-NEXT: vcmp.f16 s3, s3 ; CHECK-MVE-NEXT: cset r1, ne ; CHECK-MVE-NEXT: cmp r1, #0 -; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vseleq.f16 s4, s6, s4 ; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-MVE-NEXT: it vs ; CHECK-MVE-NEXT: movvs r0, #1 ; CHECK-MVE-NEXT: cmp r0, #0 ; CHECK-MVE-NEXT: cset r0, ne ; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: vseleq.f16 s15, s11, s7 -; CHECK-MVE-NEXT: vins.f16 s15, s16 -; CHECK-MVE-NEXT: vmov q0, q3 -; CHECK-MVE-NEXT: vpop {d8, 
d9} +; CHECK-MVE-NEXT: vseleq.f16 s3, s11, s7 +; CHECK-MVE-NEXT: vins.f16 s3, s4 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll index 9b1175fabce3b..84a9e0145f0c7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -5,11 +5,10 @@ define arm_aapcs_vfpcc <4 x float> @foo_float_int32(<4 x i32> %src) { ; CHECK-MVE-LABEL: foo_float_int32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.f32.s32 s7, s3 -; CHECK-MVE-NEXT: vcvt.f32.s32 s6, s2 -; CHECK-MVE-NEXT: vcvt.f32.s32 s5, s1 -; CHECK-MVE-NEXT: vcvt.f32.s32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vcvt.f32.s32 s3, s3 +; CHECK-MVE-NEXT: vcvt.f32.s32 s2, s2 +; CHECK-MVE-NEXT: vcvt.f32.s32 s1, s1 +; CHECK-MVE-NEXT: vcvt.f32.s32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_float_int32: @@ -24,11 +23,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @foo_float_uint32(<4 x i32> %src) { ; CHECK-MVE-LABEL: foo_float_uint32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.f32.u32 s7, s3 -; CHECK-MVE-NEXT: vcvt.f32.u32 s6, s2 -; CHECK-MVE-NEXT: vcvt.f32.u32 s5, s1 -; CHECK-MVE-NEXT: vcvt.f32.u32 s4, s0 -; CHECK-MVE-NEXT: vmov q0, q1 +; CHECK-MVE-NEXT: vcvt.f32.u32 s3, s3 +; CHECK-MVE-NEXT: vcvt.f32.u32 s2, s2 +; CHECK-MVE-NEXT: vcvt.f32.u32 s1, s1 +; CHECK-MVE-NEXT: vcvt.f32.u32 s0, s0 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: foo_float_uint32: @@ -43,15 +41,15 @@ entry: define arm_aapcs_vfpcc <4 x i32> @foo_int32_float(<4 x float> %src) { ; CHECK-MVE-LABEL: foo_int32_float: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s2 -; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s0 -; CHECK-MVE-NEXT: vcvt.s32.f32 s8, s3 -; CHECK-MVE-NEXT: vcvt.s32.f32 s10, s1 +; CHECK-MVE-NEXT: vcvt.s32.f32 s2, s2 +; CHECK-MVE-NEXT: vcvt.s32.f32 s0, s0 +; CHECK-MVE-NEXT: vcvt.s32.f32 s4, s3 +; CHECK-MVE-NEXT: vcvt.s32.f32 s6, s1 +; CHECK-MVE-NEXT: vmov 
r0, s2 +; CHECK-MVE-NEXT: vmov r1, s0 +; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-MVE-NEXT: vmov r0, s4 ; CHECK-MVE-NEXT: vmov r1, s6 -; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-MVE-NEXT: vmov r0, s8 -; CHECK-MVE-NEXT: vmov r1, s10 ; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-MVE-NEXT: bx lr ; @@ -67,15 +65,15 @@ entry: define arm_aapcs_vfpcc <4 x i32> @foo_uint32_float(<4 x float> %src) { ; CHECK-MVE-LABEL: foo_uint32_float: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s2 -; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s0 -; CHECK-MVE-NEXT: vcvt.u32.f32 s8, s3 -; CHECK-MVE-NEXT: vcvt.u32.f32 s10, s1 +; CHECK-MVE-NEXT: vcvt.u32.f32 s2, s2 +; CHECK-MVE-NEXT: vcvt.u32.f32 s0, s0 +; CHECK-MVE-NEXT: vcvt.u32.f32 s4, s3 +; CHECK-MVE-NEXT: vcvt.u32.f32 s6, s1 +; CHECK-MVE-NEXT: vmov r0, s2 +; CHECK-MVE-NEXT: vmov r1, s0 +; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-MVE-NEXT: vmov r0, s4 ; CHECK-MVE-NEXT: vmov r1, s6 -; CHECK-MVE-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-MVE-NEXT: vmov r0, s8 -; CHECK-MVE-NEXT: vmov r1, s10 ; CHECK-MVE-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-MVE-NEXT: bx lr ; @@ -96,28 +94,28 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_int16(<8 x i16> %src) { ; CHECK-MVE-NEXT: vmov s0, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[1] ; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[3] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s2 ; CHECK-MVE-NEXT: vcvt.f16.s32 s0, s0 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 +; CHECK-MVE-NEXT: vmov.s16 r0, q1[3] +; CHECK-MVE-NEXT: vins.f16 s0, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[2] -; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 -; CHECK-MVE-NEXT: vmov s10, r0 -; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.s32 s1, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s8 +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 ; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vmov.s16 r0, q1[4] +; 
CHECK-MVE-NEXT: vcvt.f16.s32 s1, s8 +; CHECK-MVE-NEXT: vins.f16 s1, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[5] -; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s8 -; CHECK-MVE-NEXT: vmov s10, r0 +; CHECK-MVE-NEXT: vcvt.f16.s32 s2, s2 +; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[7] +; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-MVE-NEXT: vins.f16 s2, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.s16 r0, q1[6] -; CHECK-MVE-NEXT: vcvt.f16.s32 s10, s10 -; CHECK-MVE-NEXT: vmov s4, r0 -; CHECK-MVE-NEXT: vins.f16 s2, s10 ; CHECK-MVE-NEXT: vcvt.f16.s32 s8, s8 +; CHECK-MVE-NEXT: vmov s4, r0 ; CHECK-MVE-NEXT: vcvt.f16.s32 s3, s4 ; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr @@ -139,28 +137,28 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_uint16(<8 x i16> %src) { ; CHECK-MVE-NEXT: vmov s0, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[1] ; CHECK-MVE-NEXT: vmov s2, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s2 ; CHECK-MVE-NEXT: vcvt.f16.u32 s0, s0 -; CHECK-MVE-NEXT: vins.f16 s0, s8 -; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] +; CHECK-MVE-NEXT: vins.f16 s0, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[2] -; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 -; CHECK-MVE-NEXT: vmov s10, r0 -; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] -; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s10 -; CHECK-MVE-NEXT: vins.f16 s1, s8 +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 ; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] +; CHECK-MVE-NEXT: vcvt.f16.u32 s1, s8 +; CHECK-MVE-NEXT: vins.f16 s1, s2 +; CHECK-MVE-NEXT: vmov s2, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] -; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s8 -; CHECK-MVE-NEXT: vmov s10, r0 +; CHECK-MVE-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-MVE-NEXT: vmov s8, r0 ; CHECK-MVE-NEXT: vmov.u16 r0, q1[7] +; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 +; CHECK-MVE-NEXT: vins.f16 s2, s8 ; CHECK-MVE-NEXT: vmov s8, r0 ; 
CHECK-MVE-NEXT: vmov.u16 r0, q1[6] -; CHECK-MVE-NEXT: vcvt.f16.u32 s10, s10 -; CHECK-MVE-NEXT: vmov s4, r0 -; CHECK-MVE-NEXT: vins.f16 s2, s10 ; CHECK-MVE-NEXT: vcvt.f16.u32 s8, s8 +; CHECK-MVE-NEXT: vmov s4, r0 ; CHECK-MVE-NEXT: vcvt.f16.u32 s3, s4 ; CHECK-MVE-NEXT: vins.f16 s3, s8 ; CHECK-MVE-NEXT: bx lr @@ -177,15 +175,15 @@ entry: define arm_aapcs_vfpcc <8 x i16> @foo_int16_half(<8 x half> %src) { ; CHECK-MVE-LABEL: foo_int16_half: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s14, s0 +; CHECK-MVE-NEXT: vmovx.f16 s6, s2 +; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s2, s0 ; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0 -; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14 +; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s2 ; CHECK-MVE-NEXT: vmov r0, s0 ; CHECK-MVE-NEXT: vmovx.f16 s4, s3 -; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vmovx.f16 s10, s1 ; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3 -; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 ; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1 ; CHECK-MVE-NEXT: vmov.16 q0[0], r0 ; CHECK-MVE-NEXT: vmov r0, s14 @@ -219,15 +217,15 @@ entry: define arm_aapcs_vfpcc <8 x i16> @foo_uint16_half(<8 x half> %src) { ; CHECK-MVE-LABEL: foo_uint16_half: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmovx.f16 s14, s0 +; CHECK-MVE-NEXT: vmovx.f16 s6, s2 +; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 +; CHECK-MVE-NEXT: vmovx.f16 s2, s0 ; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0 -; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14 +; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s2 ; CHECK-MVE-NEXT: vmov r0, s0 ; CHECK-MVE-NEXT: vmovx.f16 s4, s3 -; CHECK-MVE-NEXT: vmovx.f16 s6, s2 ; CHECK-MVE-NEXT: vmovx.f16 s10, s1 ; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3 -; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2 ; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1 ; CHECK-MVE-NEXT: vmov.16 q0[0], r0 ; CHECK-MVE-NEXT: vmov r0, s14 @@ -355,14 +353,13 @@ entry: define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc1(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: vmovn32_trunc1: ; CHECK-MVE: @ %bb.0: @ %entry -; 
CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s8 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s2 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s4 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s9 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s1 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s3 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s5 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s10 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s6 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s11 ; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s7 ; CHECK-MVE-NEXT: bx lr ; @@ -380,15 +377,14 @@ entry: define arm_aapcs_vfpcc <8 x half> @vmovn32_trunc2(<4 x float> %src1, <4 x float> %src2) { ; CHECK-MVE-LABEL: vmovn32_trunc2: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: vmov q2, q0 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s4 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s8 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s1, s5 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s9 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s2, s6 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s10 -; CHECK-MVE-NEXT: vcvtb.f16.f32 s3, s7 -; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s11 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s1, s1 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s2, s2 +; CHECK-MVE-NEXT: vcvtt.f16.f32 s3, s3 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s4 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s5 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s6 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s4, s7 ; CHECK-MVE-NEXT: bx lr ; ; CHECK-MVEFP-LABEL: vmovn32_trunc2: diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll index 98ee5fdd3f34e..844e39e2964bb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -4,11 +4,10 @@ define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) { ; CHECK-LABEL: fpext_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcvtt.f32.f16 s7, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s1 -; CHECK-NEXT: vcvtt.f32.f16 s5, s0 -; CHECK-NEXT: vcvtb.f32.f16 s4, s0 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: 
vcvtt.f32.f16 s3, s1 +; CHECK-NEXT: vcvtb.f32.f16 s2, s1 +; CHECK-NEXT: vcvtt.f32.f16 s1, s0 +; CHECK-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-NEXT: bx lr entry: %out = fpext <4 x half> %src1 to <4 x float> @@ -19,12 +18,12 @@ define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) { ; CHECK-LABEL: fpext_8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcvtt.f32.f16 s11, s1 -; CHECK-NEXT: vcvtt.f32.f16 s7, s3 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1 -; CHECK-NEXT: vcvtb.f32.f16 s6, s3 ; CHECK-NEXT: vcvtt.f32.f16 s9, s0 -; CHECK-NEXT: vcvtt.f32.f16 s5, s2 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 +; CHECK-NEXT: vcvtt.f32.f16 s7, s3 +; CHECK-NEXT: vcvtb.f32.f16 s6, s3 +; CHECK-NEXT: vcvtt.f32.f16 s5, s2 ; CHECK-NEXT: vcvtb.f32.f16 s4, s2 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr @@ -37,11 +36,10 @@ entry: define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) { ; CHECK-LABEL: fptrunc_4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcvtb.f16.f32 s4, s0 -; CHECK-NEXT: vcvtt.f16.f32 s4, s1 -; CHECK-NEXT: vcvtb.f16.f32 s5, s2 -; CHECK-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s2 +; CHECK-NEXT: vcvtt.f16.f32 s0, s1 +; CHECK-NEXT: vcvtt.f16.f32 s1, s3 ; CHECK-NEXT: bx lr entry: %out = fptrunc <4 x float> %src1 to <4 x half> @@ -51,15 +49,14 @@ entry: define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) { ; CHECK-LABEL: fptrunc_8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vcvtb.f16.f32 s0, s8 -; CHECK-NEXT: vcvtt.f16.f32 s0, s9 -; CHECK-NEXT: vcvtb.f16.f32 s1, s10 -; CHECK-NEXT: vcvtt.f16.f32 s1, s11 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: vcvtb.f16.f32 s2, s2 ; CHECK-NEXT: vcvtb.f16.f32 s2, s4 +; CHECK-NEXT: vcvtt.f16.f32 s0, s1 +; CHECK-NEXT: vcvtt.f16.f32 s1, s3 ; CHECK-NEXT: vcvtt.f16.f32 s2, s5 -; CHECK-NEXT: vcvtb.f16.f32 s3, s6 ; CHECK-NEXT: vcvtt.f16.f32 s3, s7 +; CHECK-NEXT: vcvtb.f16.f32 s4, s6 ; CHECK-NEXT: bx lr entry: %out = fptrunc <8 x 
float> %src1 to <8 x half> @@ -247,12 +244,12 @@ define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(<16 x half>* %src) { ; CHECK-NEXT: vld20.16 {q2, q3}, [r0] ; CHECK-NEXT: vld21.16 {q2, q3}, [r0] ; CHECK-NEXT: vcvtt.f32.f16 s3, s9 -; CHECK-NEXT: vcvtt.f32.f16 s7, s11 ; CHECK-NEXT: vcvtb.f32.f16 s2, s9 -; CHECK-NEXT: vcvtb.f32.f16 s6, s11 ; CHECK-NEXT: vcvtt.f32.f16 s1, s8 -; CHECK-NEXT: vcvtt.f32.f16 s5, s10 ; CHECK-NEXT: vcvtb.f32.f16 s0, s8 +; CHECK-NEXT: vcvtt.f32.f16 s7, s11 +; CHECK-NEXT: vcvtb.f32.f16 s6, s11 +; CHECK-NEXT: vcvtt.f32.f16 s5, s10 ; CHECK-NEXT: vcvtb.f32.f16 s4, s10 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll index 0f7393c85d670..f444ec4ef1e94 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -136,7 +136,6 @@ entry: define arm_aapcs_vfpcc <2 x double> @vdup_f64(double %src) { ; CHECK-LABEL: vdup_f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: vmov.f32 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s1 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll index c96baf10bc607..bd2aa4be5fab7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll @@ -16,17 +16,17 @@ entry: define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fadd_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vadd.f32 s6, s2, s3 +; CHECK-FP-NEXT: vadd.f32 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f32 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vadd.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vadd.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vadd.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, 
s1 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vadd.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -38,21 +38,21 @@ define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fadd_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vadd.f32 q0, q0, q1 -; CHECK-FP-NEXT: vadd.f32 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f32 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f32 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f32 s0, s0, s4 +; CHECK-FP-NEXT: vadd.f32 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v8f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vadd.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vadd.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vadd.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vadd.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vadd.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vadd.f32 s2, s10, s14 -; CHECK-NOFP-NEXT: vadd.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vadd.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vadd.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vadd.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -63,8 +63,8 @@ entry: define arm_aapcs_vfpcc half @fadd_v2f16(<2 x half> %x, half %y) { ; CHECK-LABEL: fadd_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vadd.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -75,21 +75,21 @@ entry: define arm_aapcs_vfpcc half @fadd_v4f16(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fadd_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vadd.f16 s6, s1, s6 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vadd.f16 s2, s1, 
s2 ; CHECK-FP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vadd.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vadd.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -102,25 +102,25 @@ define arm_aapcs_vfpcc half @fadd_v8f16(<8 x half> %x, half %y) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vadd.f16 q0, q0, q2 -; CHECK-FP-NEXT: vadd.f16 s6, s2, s3 +; CHECK-FP-NEXT: vadd.f16 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f16 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vadd.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vadd.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vadd.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; 
CHECK-NOFP-NEXT: vadd.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -134,37 +134,37 @@ define arm_aapcs_vfpcc half @fadd_v16f16(<16 x half> %x, half %y) { ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vadd.f16 q0, q0, q1 -; CHECK-FP-NEXT: vadd.f16 s4, s2, s3 +; CHECK-FP-NEXT: vadd.f16 s2, s2, s3 ; CHECK-FP-NEXT: vadd.f16 s0, s0, s1 -; CHECK-FP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-FP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-FP-NEXT: vadd.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fadd_v16f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vadd.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vadd.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vadd.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vadd.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vadd.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vadd.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vadd.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vadd.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vadd.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vadd.f16 s4, s1, s5 ; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vadd.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vadd.f16 s4, s10, s4 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vadd.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; 
CHECK-NOFP-NEXT: vadd.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vadd.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vadd.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vadd.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -196,9 +196,9 @@ entry: define arm_aapcs_vfpcc double @fadd_v4f64(<4 x double> %x, double %y) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f64 d5, d1, d3 +; CHECK-NEXT: vadd.f64 d1, d1, d3 ; CHECK-NEXT: vadd.f64 d0, d0, d2 -; CHECK-NEXT: vadd.f64 d0, d0, d5 +; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: vadd.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -209,8 +209,8 @@ entry: define arm_aapcs_vfpcc float @fadd_v2f32_nofast(<2 x float> %x, float %y) { ; CHECK-LABEL: fadd_v2f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s4, s4, s0 -; CHECK-NEXT: vadd.f32 s0, s4, s1 +; CHECK-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fadd.f32.v2f32(float %y, <2 x float> %x) @@ -220,10 +220,10 @@ entry: define arm_aapcs_vfpcc float @fadd_v4f32_nofast(<4 x float> %x, float %y) { ; CHECK-LABEL: fadd_v4f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s4, s4, s0 -; CHECK-NEXT: vadd.f32 s4, s4, s1 -; CHECK-NEXT: vadd.f32 s4, s4, s2 -; CHECK-NEXT: vadd.f32 s0, s4, s3 +; CHECK-NEXT: vadd.f32 s0, s4, s0 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fadd.f32.v4f32(float %y, <4 x float> %x) @@ -233,10 +233,10 @@ entry: define arm_aapcs_vfpcc float @fadd_v8f32_nofast(<8 x float> %x, float %y) { ; CHECK-LABEL: fadd_v8f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f32 s8, s8, s0 -; CHECK-NEXT: vadd.f32 s8, s8, s1 -; CHECK-NEXT: vadd.f32 s8, s8, s2 
-; CHECK-NEXT: vadd.f32 s0, s8, s3 +; CHECK-NEXT: vadd.f32 s0, s8, s0 +; CHECK-NEXT: vadd.f32 s0, s0, s1 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s0, s3 ; CHECK-NEXT: vadd.f32 s0, s0, s4 ; CHECK-NEXT: vadd.f32 s0, s0, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s6 @@ -250,12 +250,12 @@ entry: define arm_aapcs_vfpcc half @fadd_v4f16_nofast(<4 x half> %x, half %y) { ; CHECK-LABEL: fadd_v4f16_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NEXT: vadd.f16 s0, s4, s0 +; CHECK-NEXT: vadd.f16 s2, s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vadd.f16 s0, s2, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fadd.f16.v4f16(half %y, <4 x half> %x) @@ -266,17 +266,17 @@ define arm_aapcs_vfpcc half @fadd_v8f16_nofast(<8 x half> %x, half %y) { ; CHECK-LABEL: fadd_v8f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vadd.f16 s4, s4, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vadd.f16 s4, s4, s2 -; CHECK-NEXT: vadd.f16 s4, s4, s6 -; CHECK-NEXT: vadd.f16 s4, s4, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vadd.f16 s0, s4, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s4 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fadd.f16.v8f16(half %y, <8 x half> %x) @@ -287,18 +287,18 @@ define arm_aapcs_vfpcc half 
@fadd_v16f16_nofast(<16 x half> %x, half %y) { ; CHECK-LABEL: fadd_v16f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.f16 s8, s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s0 -; CHECK-NEXT: vadd.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vadd.f16 s8, s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vadd.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vadd.f16 s8, s8, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vadd.f16 s8, s8, s10 -; CHECK-NEXT: vadd.f16 s8, s8, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vadd.f16 s0, s8, s0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s1 +; CHECK-NEXT: vadd.f16 s0, s0, s8 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s3 +; CHECK-NEXT: vadd.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 ; CHECK-NEXT: vadd.f16 s0, s0, s4 ; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s5 @@ -329,8 +329,8 @@ entry: define arm_aapcs_vfpcc double @fadd_v2f64_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fadd_v2f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f64 d2, d2, d0 -; CHECK-NEXT: vadd.f64 d0, d2, d1 +; CHECK-NEXT: vadd.f64 d0, d2, d0 +; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fadd.f64.v2f64(double %y, <2 x double> %x) @@ -340,8 +340,8 @@ entry: define arm_aapcs_vfpcc double @fadd_v4f64_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fadd_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vadd.f64 d4, d4, d0 -; CHECK-NEXT: vadd.f64 d0, d4, d1 +; CHECK-NEXT: vadd.f64 d0, d4, d0 +; CHECK-NEXT: vadd.f64 d0, d0, d1 ; CHECK-NEXT: vadd.f64 d0, d0, d2 ; CHECK-NEXT: vadd.f64 d0, d0, d3 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll index 07a0077b09301..7cafb7262f460 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll @@ -15,16 +15,16 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -36,9 +36,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) { ; CHECK-FP-LABEL: fmin_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32: @@ -49,15 +49,15 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) { ; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s12 -; CHECK-NOFP-NEXT: 
vminnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -67,20 +67,20 @@ entry: define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -92,24 +92,24 @@ define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: 
vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -122,9 +122,9 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) { ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16: @@ -132,42 +132,42 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) { ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 
s10, s12 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s8, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -199,10 +199,10 @@ define arm_aapcs_vfpcc double @fmin_v4f64(<4 x double> %x) { ; CHECK-NEXT: vcmp.f64 d3, d1 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; 
CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -222,16 +222,16 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmin_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %x) @@ -242,20 +242,20 @@ define arm_aapcs_vfpcc float @fmin_v8f32_nofast(<8 x float> %x) { ; CHECK-FP-LABEL: fmin_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s10, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f32 s8, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f32 s8, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f32 s10, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s8, s8, s10 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: 
vminnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %x) @@ -265,20 +265,20 @@ entry: define arm_aapcs_vfpcc half @fmin_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmin_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vminnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %x) @@ -290,24 +290,24 @@ define arm_aapcs_vfpcc half @fmin_v8f16_nofast(<8 x half> %x) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: 
vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %x) @@ -320,36 +320,36 @@ define arm_aapcs_vfpcc half @fmin_v16f16_nofast(<16 x half> %x) { ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v16f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; 
CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s1, s5 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s8, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %x) @@ -378,9 +378,9 @@ entry: define arm_aapcs_vfpcc double @fmin_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmin_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminnm.f64 d4, d1, d3 +; CHECK-NEXT: vminnm.f64 d1, d1, d3 ; CHECK-NEXT: vminnm.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d4 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x) @@ -403,17 +403,17 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f32 s0, s4, s0 ; 
CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vminnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -427,9 +427,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v8f32_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -441,15 +441,15 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) { ; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f32 s14, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s14 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -462,21 +462,21 @@ entry: define arm_aapcs_vfpcc half @fmin_v4f16_acc(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmin_v4f16_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; 
CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vminnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -489,8 +489,8 @@ entry: define arm_aapcs_vfpcc half @fmin_v2f16_acc(<2 x half> %x, half %y) { ; CHECK-LABEL: fmin_v2f16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -505,25 +505,25 @@ define arm_aapcs_vfpcc half @fmin_v8f16_acc(<8 x half> %x, half %y) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vminnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmin_v8f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; 
CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -539,9 +539,9 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc(<16 x half> %x, half %y) { ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -550,42 +550,42 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc(<16 x half> %x, half %y) { ; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s5, s1 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 
-; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s6, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s14 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s7, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -626,10 +626,10 @@ define arm_aapcs_vfpcc double @fmin_v4f64_acc(<4 x double> %x, double %y) { ; CHECK-NEXT: vcmp.f64 d3, d1 ; CHECK-NEXT: 
vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d2, d0 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vminnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -657,9 +657,9 @@ entry: define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vminnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -667,9 +667,9 @@ define arm_aapcs_vfpcc float @fmin_v4f32_acc_nofast(<4 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmin_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vminnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -685,9 +685,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vminnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -695,13 +695,13 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc_nofast(<8 x float> %x, float %y) { ; ; 
CHECK-NOFP-LABEL: fmin_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vminnm.f32 s12, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f32 s10, s10, s12 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f32 s0, s10, s0 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vminnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f32 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -716,11 +716,11 @@ entry: define arm_aapcs_vfpcc half @fmin_v4f16_acc_nofast(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vminnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vminnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -728,11 +728,11 @@ define arm_aapcs_vfpcc half @fmin_v4f16_acc_nofast(<4 x half> %x, half %y) { ; ; CHECK-NOFP-LABEL: fmin_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; 
CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -749,9 +749,9 @@ define arm_aapcs_vfpcc half @fmin_v8f16_acc_nofast(<8 x half> %x, half %y) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vminnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s0, s4 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -760,16 +760,16 @@ define arm_aapcs_vfpcc half @fmin_v8f16_acc_nofast(<8 x half> %x, half %y) { ; CHECK-NOFP-LABEL: fmin_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vminnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -787,9 +787,9 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc_nofast(<16 x half> %x, half %y) { ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vminnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vminnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: 
vminnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vminnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s0, s8 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s8, s0 @@ -797,29 +797,29 @@ define arm_aapcs_vfpcc half @fmin_v16f16_acc_nofast(<16 x half> %x, half %y) { ; ; CHECK-NOFP-LABEL: fmin_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vminnm.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vminnm.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vminnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vminnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; 
CHECK-NOFP-NEXT: vminnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vminnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vminnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s0, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s8, s0 @@ -863,9 +863,9 @@ entry: define arm_aapcs_vfpcc double @fmin_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmin_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminnm.f64 d5, d1, d3 +; CHECK-NEXT: vminnm.f64 d1, d1, d3 ; CHECK-NEXT: vminnm.f64 d0, d0, d2 -; CHECK-NEXT: vminnm.f64 d0, d0, d5 +; CHECK-NEXT: vminnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d0, d4 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 @@ -890,16 +890,16 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -910,9 +910,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) { ; CHECK-FP-LABEL: fmax_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: 
fmax_v8f32: @@ -923,15 +923,15 @@ define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) { ; CHECK-NOFP-NEXT: vselgt.f32 s8, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s12 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -941,20 +941,20 @@ entry: define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x 
half> %x) @@ -966,24 +966,24 @@ define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -996,9 +996,9 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) { ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16: @@ -1006,42 +1006,42 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) { ; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; 
CHECK-NOFP-NEXT: vmovx.f16 s10, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vselgt.f16 s8, s10, s8 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s8, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s8, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s10, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; 
CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call fast half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1073,10 +1073,10 @@ define arm_aapcs_vfpcc double @fmax_v4f64(<4 x double> %x) { ; CHECK-NEXT: vcmp.f64 d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d4, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -1096,16 +1096,16 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_nofast(<4 x float> %x) { ; CHECK-FP-LABEL: fmax_v4f32_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %x) @@ -1116,20 +1116,20 @@ define arm_aapcs_vfpcc float @fmax_v8f32_nofast(<8 x float> %x) { ; CHECK-FP-LABEL: fmax_v8f32_nofast: ; CHECK-FP: @ %bb.0: @ 
%entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f32_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s8, s8, s10 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %x) @@ -1139,20 +1139,20 @@ entry: define arm_aapcs_vfpcc half @fmax_v4f16_nofast(<4 x half> %x) { ; CHECK-FP-LABEL: fmax_v4f16_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s4, s1 -; CHECK-FP-NEXT: vmovx.f16 s6, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s1, s4 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s4, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; 
CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %x) @@ -1164,24 +1164,24 @@ define arm_aapcs_vfpcc half @fmax_v8f16_nofast(<8 x half> %x) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s4, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s0, s4 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s4, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %x) @@ -1194,36 +1194,36 @@ define arm_aapcs_vfpcc half @fmax_v16f16_nofast(<16 x half> %x) { ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v16f16_nofast: ; CHECK-NOFP: @ 
%bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s10, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s10, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s8, s8, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s1, s5 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s8, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %x) @@ -1252,9 +1252,9 @@ entry: define arm_aapcs_vfpcc double @fmax_v4f64_nofast(<4 x double> %x) { ; CHECK-LABEL: fmax_v4f64_nofast: ; 
CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmaxnm.f64 d4, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d1, d1, d3 ; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d4 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x) @@ -1277,17 +1277,17 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1301,9 +1301,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v8f32_acc: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -1315,15 +1315,15 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) { ; CHECK-NOFP-NEXT: vselgt.f32 s10, s1, s5 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f32 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f32 s14, 
s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vselgt.f32 s2, s2, s6 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vselgt.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s14 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vselgt.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1336,8 +1336,8 @@ entry: define arm_aapcs_vfpcc half @fmax_v2f16_acc(<2 x half> %x, half %y) { ; CHECK-LABEL: fmax_v2f16_acc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -1350,21 +1350,21 @@ entry: define arm_aapcs_vfpcc half @fmax_v4f16_acc(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmax_v4f16_acc: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1379,25 +1379,25 @@ 
define arm_aapcs_vfpcc half @fmax_v8f16_acc(<8 x half> %x, half %y) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmax_v8f16_acc: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1413,9 +1413,9 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc(<16 x half> %x, half %y) { ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; @@ -1424,42 +1424,42 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc(<16 x half> %x, half %y) { ; 
CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 ; CHECK-NOFP-NEXT: vcmp.f16 s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 ; CHECK-NOFP-NEXT: vselgt.f16 s10, s12, s10 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s1, s5 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vcmp.f16 s10, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s2, s6 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vcmp.f16 s14, s12 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vselgt.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vcmp.f16 s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vcmp.f16 s3, s7 -; CHECK-NOFP-NEXT: vselgt.f16 s12, s14, s12 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s2, s4 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vcmp.f16 s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s12, 
s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vcmp.f16 s4, s2 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vselgt.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vselgt.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -1500,10 +1500,10 @@ define arm_aapcs_vfpcc double @fmax_v4f64_acc(<4 x double> %x, double %y) { ; CHECK-NEXT: vcmp.f64 d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vcmp.f64 d0, d2 -; CHECK-NEXT: vselgt.f64 d5, d1, d3 +; CHECK-NEXT: vselgt.f64 d1, d1, d3 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vmaxnm.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -1531,9 +1531,9 @@ entry: define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmaxnm.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s4, s0 @@ -1541,9 +1541,9 @@ define arm_aapcs_vfpcc float @fmax_v4f32_acc_nofast(<4 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmax_v4f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vcmp.f32 
s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s4, s0 @@ -1559,9 +1559,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmaxnm.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f32 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f32 s0, s8, s0 @@ -1569,13 +1569,13 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc_nofast(<8 x float> %x, float %y) { ; ; CHECK-NOFP-LABEL: fmax_v8f32_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f32 s12, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f32 s10, s10, s12 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f32 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f32 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f32 s0, s8, s0 @@ -1590,11 +1590,11 @@ entry: define arm_aapcs_vfpcc half @fmax_v4f16_acc_nofast(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s1, s6 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: 
vcmp.f16 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -1602,11 +1602,11 @@ define arm_aapcs_vfpcc half @fmax_v4f16_acc_nofast(<4 x half> %x, half %y) { ; ; CHECK-NOFP-LABEL: fmax_v4f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -1623,9 +1623,9 @@ define arm_aapcs_vfpcc half @fmax_v8f16_acc_nofast(<8 x half> %x, half %y) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q2 -; CHECK-FP-NEXT: vmaxnm.f16 s6, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s4, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s4, s0 @@ -1634,16 +1634,16 @@ define arm_aapcs_vfpcc half @fmax_v8f16_acc_nofast(<8 x half> %x, half %y) { ; CHECK-NOFP-LABEL: fmax_v8f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmaxnm.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; 
CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s4, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s4, s0 @@ -1661,9 +1661,9 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc_nofast(<16 x half> %x, half %y) { ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmaxnm.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmaxnm.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmaxnm.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-FP-NEXT: vcmp.f16 s8, s0 ; CHECK-FP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-FP-NEXT: vselgt.f16 s0, s8, s0 @@ -1671,29 +1671,29 @@ define arm_aapcs_vfpcc half @fmax_v16f16_acc_nofast(<16 x half> %x, half %y) { ; ; CHECK-NOFP-LABEL: fmax_v16f16_acc_nofast: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmaxnm.f16 
s10, s10, s12 -; CHECK-NOFP-NEXT: vmaxnm.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vmaxnm.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s1, s5 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s10, s4 ; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vmaxnm.f16 s4, s2, s6 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vmaxnm.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmaxnm.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vcmp.f16 s8, s0 ; CHECK-NOFP-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NOFP-NEXT: vselgt.f16 s0, s8, s0 @@ -1737,9 +1737,9 @@ entry: define arm_aapcs_vfpcc double @fmax_v4f64_acc_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmax_v4f64_acc_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmaxnm.f64 d5, d1, d3 +; CHECK-NEXT: vmaxnm.f64 d1, d1, d3 ; CHECK-NEXT: vmaxnm.f64 d0, d0, d2 -; CHECK-NEXT: vmaxnm.f64 d0, d0, d5 +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 ; CHECK-NEXT: vcmp.f64 d4, d0 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-NEXT: vselgt.f64 d0, d4, d0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll index bbc30d99d10c6..b847b05f566f1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll @@ -16,17 +16,17 @@ entry: define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) { ; CHECK-FP-LABEL: fmul_v4f32: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmul.f32 s6, s2, s3 +; CHECK-FP-NEXT: vmul.f32 s2, s2, s3 ; 
CHECK-FP-NEXT: vmul.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f32 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f32 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v4f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmul.f32 s6, s0, s1 -; CHECK-NOFP-NEXT: vmul.f32 s6, s6, s2 -; CHECK-NOFP-NEXT: vmul.f32 s0, s6, s3 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s3 ; CHECK-NOFP-NEXT: vmul.f32 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -38,21 +38,21 @@ define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) { ; CHECK-FP-LABEL: fmul_v8f32: ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vmul.f32 q0, q0, q1 -; CHECK-FP-NEXT: vmul.f32 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f32 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f32 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f32 s0, s0, s4 +; CHECK-FP-NEXT: vmul.f32 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v8f32: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmul.f32 s12, s0, s4 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmul.f32 s10, s1, s5 -; CHECK-NOFP-NEXT: vmul.f32 s14, s2, s6 -; CHECK-NOFP-NEXT: vmul.f32 s0, s3, s7 -; CHECK-NOFP-NEXT: vmul.f32 s10, s12, s10 -; CHECK-NOFP-NEXT: vmul.f32 s2, s10, s14 -; CHECK-NOFP-NEXT: vmul.f32 s0, s2, s0 +; CHECK-NOFP-NEXT: vmul.f32 s2, s2, s6 +; CHECK-NOFP-NEXT: vmul.f32 s4, s3, s7 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s10 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NOFP-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NOFP-NEXT: vmul.f32 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -63,8 +63,8 @@ entry: define arm_aapcs_vfpcc half @fmul_v2f16(<2 x half> %x, half %y) { ; CHECK-LABEL: fmul_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmul.f16 s0, s0, s6 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NEXT: bx lr entry: @@ -75,21 
+75,21 @@ entry: define arm_aapcs_vfpcc half @fmul_v4f16(<4 x half> %x, half %y) { ; CHECK-FP-LABEL: fmul_v4f16: ; CHECK-FP: @ %bb.0: @ %entry -; CHECK-FP-NEXT: vmovx.f16 s6, s1 -; CHECK-FP-NEXT: vmovx.f16 s8, s0 -; CHECK-FP-NEXT: vmul.f16 s6, s1, s6 -; CHECK-FP-NEXT: vmul.f16 s0, s0, s8 +; CHECK-FP-NEXT: vmovx.f16 s2, s1 +; CHECK-FP-NEXT: vmovx.f16 s6, s0 +; CHECK-FP-NEXT: vmul.f16 s2, s1, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v4f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmul.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s1 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmul.f16 s0, s6, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s0 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -102,25 +102,25 @@ define arm_aapcs_vfpcc half @fmul_v8f16(<8 x half> %x, half %y) { ; CHECK-FP: @ %bb.0: @ %entry ; CHECK-FP-NEXT: vrev32.16 q2, q0 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q2 -; CHECK-FP-NEXT: vmul.f16 s6, s2, s3 +; CHECK-FP-NEXT: vmul.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v8f16: ; CHECK-NOFP: @ %bb.0: @ %entry ; CHECK-NOFP-NEXT: vmovx.f16 s6, s0 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s1 -; CHECK-NOFP-NEXT: vmul.f16 s6, s0, s6 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s1 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmovx.f16 s8, s2 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s2 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s8 -; CHECK-NOFP-NEXT: vmul.f16 s6, s6, s3 -; CHECK-NOFP-NEXT: vmul.f16 s0, s6, s0 +; 
CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s6, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s6 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s3 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s3 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmul.f16 s0, s4, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -134,37 +134,37 @@ define arm_aapcs_vfpcc half @fmul_v16f16(<16 x half> %x, half %y) { ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 ; CHECK-FP-NEXT: vrev32.16 q1, q0 ; CHECK-FP-NEXT: vmul.f16 q0, q0, q1 -; CHECK-FP-NEXT: vmul.f16 s4, s2, s3 +; CHECK-FP-NEXT: vmul.f16 s2, s2, s3 ; CHECK-FP-NEXT: vmul.f16 s0, s0, s1 -; CHECK-FP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-FP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-FP-NEXT: vmul.f16 s0, s8, s0 ; CHECK-FP-NEXT: bx lr ; ; CHECK-NOFP-LABEL: fmul_v16f16: ; CHECK-NOFP: @ %bb.0: @ %entry -; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 ; CHECK-NOFP-NEXT: vmovx.f16 s12, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s4 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 ; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmul.f16 s12, s0, s4 -; CHECK-NOFP-NEXT: vmul.f16 s10, s12, s10 -; CHECK-NOFP-NEXT: vmul.f16 s12, s1, s5 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s5 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s1 -; CHECK-NOFP-NEXT: vmovx.f16 s4, s7 -; CHECK-NOFP-NEXT: vmul.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s14, s2 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmul.f16 s12, s2, s6 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s12, s6 -; CHECK-NOFP-NEXT: vmul.f16 s12, s14, s12 -; CHECK-NOFP-NEXT: vmovx.f16 s0, s3 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 -; CHECK-NOFP-NEXT: vmul.f16 s12, s3, s7 -; CHECK-NOFP-NEXT: vmul.f16 s10, s10, s12 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s10 +; CHECK-NOFP-NEXT: vmul.f16 s4, s1, s5 +; 
CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s5 +; CHECK-NOFP-NEXT: vmovx.f16 s10, s1 +; CHECK-NOFP-NEXT: vmul.f16 s4, s10, s4 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 +; CHECK-NOFP-NEXT: vmul.f16 s4, s2, s6 ; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s4 -; CHECK-NOFP-NEXT: vmul.f16 s0, s10, s0 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s6 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s2 +; CHECK-NOFP-NEXT: vmul.f16 s2, s2, s4 +; CHECK-NOFP-NEXT: vmovx.f16 s4, s3 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmul.f16 s2, s3, s7 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NOFP-NEXT: vmovx.f16 s2, s7 +; CHECK-NOFP-NEXT: vmul.f16 s2, s4, s2 +; CHECK-NOFP-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NOFP-NEXT: vmul.f16 s0, s8, s0 ; CHECK-NOFP-NEXT: bx lr entry: @@ -196,9 +196,9 @@ entry: define arm_aapcs_vfpcc double @fmul_v4f64(<4 x double> %x, double %y) { ; CHECK-LABEL: fmul_v4f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f64 d5, d1, d3 +; CHECK-NEXT: vmul.f64 d1, d1, d3 ; CHECK-NEXT: vmul.f64 d0, d0, d2 -; CHECK-NEXT: vmul.f64 d0, d0, d5 +; CHECK-NEXT: vmul.f64 d0, d0, d1 ; CHECK-NEXT: vmul.f64 d0, d4, d0 ; CHECK-NEXT: bx lr entry: @@ -209,8 +209,8 @@ entry: define arm_aapcs_vfpcc float @fmul_v2f32_nofast(<2 x float> %x, float %y) { ; CHECK-LABEL: fmul_v2f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s4, s4, s0 -; CHECK-NEXT: vmul.f32 s0, s4, s1 +; CHECK-NEXT: vmul.f32 s0, s4, s0 +; CHECK-NEXT: vmul.f32 s0, s0, s1 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmul.f32.v2f32(float %y, <2 x float> %x) @@ -220,10 +220,10 @@ entry: define arm_aapcs_vfpcc float @fmul_v4f32_nofast(<4 x float> %x, float %y) { ; CHECK-LABEL: fmul_v4f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s4, s4, s0 -; CHECK-NEXT: vmul.f32 s4, s4, s1 -; CHECK-NEXT: vmul.f32 s4, s4, s2 -; CHECK-NEXT: vmul.f32 s0, s4, s3 +; CHECK-NEXT: vmul.f32 s0, s4, s0 +; CHECK-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NEXT: 
vmul.f32 s0, s0, s3 ; CHECK-NEXT: bx lr entry: %z = call float @llvm.vector.reduce.fmul.f32.v4f32(float %y, <4 x float> %x) @@ -233,10 +233,10 @@ entry: define arm_aapcs_vfpcc float @fmul_v8f32_nofast(<8 x float> %x, float %y) { ; CHECK-LABEL: fmul_v8f32_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f32 s8, s8, s0 -; CHECK-NEXT: vmul.f32 s8, s8, s1 -; CHECK-NEXT: vmul.f32 s8, s8, s2 -; CHECK-NEXT: vmul.f32 s0, s8, s3 +; CHECK-NEXT: vmul.f32 s0, s8, s0 +; CHECK-NEXT: vmul.f32 s0, s0, s1 +; CHECK-NEXT: vmul.f32 s0, s0, s2 +; CHECK-NEXT: vmul.f32 s0, s0, s3 ; CHECK-NEXT: vmul.f32 s0, s0, s4 ; CHECK-NEXT: vmul.f32 s0, s0, s5 ; CHECK-NEXT: vmul.f32 s0, s0, s6 @@ -250,9 +250,9 @@ entry: define arm_aapcs_vfpcc half @fmul_v2f16_nofast(<2 x half> %x, half %y) { ; CHECK-LABEL: fmul_v2f16_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f16 s4, s4, s0 +; CHECK-NEXT: vmul.f16 s2, s4, s0 ; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vmul.f16 s0, s2, s0 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmul.f16.v2f16(half %y, <2 x half> %x) @@ -262,12 +262,12 @@ entry: define arm_aapcs_vfpcc half @fmul_v4f16_nofast(<4 x half> %x, half %y) { ; CHECK-LABEL: fmul_v4f16_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmul.f16 s4, s4, s1 -; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vmul.f16 s2, s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmul.f16 s0, s2, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmul.f16.v4f16(half %y, <4 x half> %x) @@ -278,17 +278,17 @@ define arm_aapcs_vfpcc half @fmul_v8f16_nofast(<8 x half> %x, half %y) { ; CHECK-LABEL: fmul_v8f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.f16 s4, s4, s0 -; CHECK-NEXT: vmovx.f16 s6, s0 
-; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmul.f16 s4, s4, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmul.f16 s4, s4, s2 -; CHECK-NEXT: vmul.f16 s4, s4, s6 -; CHECK-NEXT: vmul.f16 s4, s4, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmul.f16 s0, s4, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s4 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: bx lr entry: %z = call half @llvm.vector.reduce.fmul.f16.v8f16(half %y, <8 x half> %x) @@ -299,18 +299,18 @@ define arm_aapcs_vfpcc half @fmul_v16f16_nofast(<16 x half> %x, half %y) { ; CHECK-LABEL: fmul_v16f16_nofast: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmul.f16 s8, s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s0 -; CHECK-NEXT: vmul.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmul.f16 s8, s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmul.f16 s8, s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s2 -; CHECK-NEXT: vmul.f16 s8, s8, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmul.f16 s8, s8, s10 -; CHECK-NEXT: vmul.f16 s8, s8, s3 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmul.f16 s0, s8, s0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s1 +; CHECK-NEXT: vmul.f16 s0, s0, s8 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s3 +; CHECK-NEXT: vmul.f16 s0, s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 ; CHECK-NEXT: vmul.f16 s0, s0, s4 ; CHECK-NEXT: vmul.f16 s0, s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s5 @@ -341,8 +341,8 @@ entry: define arm_aapcs_vfpcc double @fmul_v2f64_nofast(<2 x double> %x, double %y) { ; CHECK-LABEL: fmul_v2f64_nofast: ; CHECK: 
@ %bb.0: @ %entry -; CHECK-NEXT: vmul.f64 d2, d2, d0 -; CHECK-NEXT: vmul.f64 d0, d2, d1 +; CHECK-NEXT: vmul.f64 d0, d2, d0 +; CHECK-NEXT: vmul.f64 d0, d0, d1 ; CHECK-NEXT: bx lr entry: %z = call double @llvm.vector.reduce.fmul.f64.v2f64(double %y, <2 x double> %x) @@ -352,8 +352,8 @@ entry: define arm_aapcs_vfpcc double @fmul_v4f64_nofast(<4 x double> %x, double %y) { ; CHECK-LABEL: fmul_v4f64_nofast: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmul.f64 d4, d4, d0 -; CHECK-NEXT: vmul.f64 d0, d4, d1 +; CHECK-NEXT: vmul.f64 d0, d4, d0 +; CHECK-NEXT: vmul.f64 d0, d0, d1 ; CHECK-NEXT: vmul.f64 d0, d0, d2 ; CHECK-NEXT: vmul.f64 d0, d0, d3 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll index 7bcc0193217d3..f5adcf0427649 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll @@ -503,10 +503,10 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vadd.f32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB5_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vadd.f32 s4, s2, s3 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: vadd.f32 s0, s0, s4 +; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: beq .LBB5_9 ; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -601,10 +601,10 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vmul.f32 q0, q1, q0 ; CHECK-NEXT: le lr, .LBB6_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmul.f32 s4, s2, s3 +; CHECK-NEXT: vmul.f32 s2, s2, s3 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: vmul.f32 s0, s0, s1 -; CHECK-NEXT: vmul.f32 s0, s0, s4 +; CHECK-NEXT: vmul.f32 s0, s0, s2 ; CHECK-NEXT: beq .LBB6_9 ; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1 ; CHECK-NEXT: sub.w lr, r1, r2 @@ -1464,9 +1464,9 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: 
vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB15_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vminnm.f32 s4, s2, s3 +; CHECK-NEXT: vminnm.f32 s2, s2, s3 ; CHECK-NEXT: vminnm.f32 s0, s0, s1 -; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: beq .LBB15_9 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 @@ -1567,9 +1567,9 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) { ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: le lr, .LBB16_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block -; CHECK-NEXT: vmaxnm.f32 s4, s2, s3 +; CHECK-NEXT: vmaxnm.f32 s2, s2, s3 ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 -; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 ; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: beq .LBB16_9 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll index d06a5418c70da..bf966ee17b7e6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -54,17 +54,17 @@ define arm_aapcs_vfpcc <8 x i16> @vrhadd_s16(<8 x i16> %src1, <8 x i16> %src2) { define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vrhadd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, 
r1, r2, asr #31 @@ -77,27 +77,26 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: adc.w r3, r2, r3, asr #31 ; CHECK-NEXT: adds r2, r1, #1 ; CHECK-NEXT: adc r1, r3, #0 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: adds r0, #1 ; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: adds r1, r1, r3 ; CHECK-NEXT: adc.w r3, r2, r3, asr #31 ; CHECK-NEXT: adds r2, r1, #1 ; CHECK-NEXT: adc r1, r3, #0 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -153,17 +152,17 @@ define arm_aapcs_vfpcc <8 x i16> @vhadd_s16(<8 x i16> %src1, <8 x i16> %src2) { define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vhadd_s32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s8, s6 +; 
CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 @@ -173,22 +172,21 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_s32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: asr.w r12, r1, #31 ; CHECK-NEXT: adc.w r1, r12, r3, asr #31 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r2, r1, r3 ; CHECK-NEXT: asr.w r12, r1, #31 ; CHECK-NEXT: adc.w r1, r12, r3, asr #31 ; CHECK-NEXT: lsrl r2, r1, #1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -255,10 +253,10 @@ define arm_aapcs_vfpcc <4 x i32> @vrhadd_u32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vand q2, q2, q4 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vand q3, q3, q4 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov r2, r3, d6 @@ -356,10 +354,10 @@ define arm_aapcs_vfpcc <4 x i32> @vhadd_u32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: 
vmov.f32 s12, s2 ; CHECK-NEXT: vand q2, q2, q4 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vand q3, q3, q4 ; CHECK-NEXT: vmov r0, r1, d4 ; CHECK-NEXT: vmov r2, r3, d6 @@ -498,23 +496,23 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d9} +; CHECK-NEXT: vpush {d9} ; CHECK-NEXT: mov.w lr, #256 ; CHECK-NEXT: .LBB14_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s18, s5 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: asrs r4, r3, #31 ; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: asr.w r4, r3, #31 ; CHECK-NEXT: adc.w r3, r4, r5, asr #31 ; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: lsrl r12, r3, #1 @@ -523,24 +521,24 @@ define void @vhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly ; CHECK-NEXT: asr.w r4, r3, #31 ; CHECK-NEXT: adc.w r3, r4, r5, asr #31 ; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r12 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov q3[2], q3[0], r6, r12 ; CHECK-NEXT: adds r4, r3, r5 ; CHECK-NEXT: asr.w r6, r3, #31 ; CHECK-NEXT: adc.w r3, r6, r5, asr #31 ; CHECK-NEXT: lsrl r4, r3, #1 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r3, s10 
; CHECK-NEXT: adds r6, r3, r5 ; CHECK-NEXT: asr.w r12, r3, #31 ; CHECK-NEXT: adc.w r3, r12, r5, asr #31 ; CHECK-NEXT: lsrl r6, r3, #1 -; CHECK-NEXT: vmov q4[3], q4[1], r6, r4 -; CHECK-NEXT: vstrb.8 q4, [r2], #16 +; CHECK-NEXT: vmov q3[3], q3[1], r6, r4 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 ; CHECK-NEXT: le lr, .LBB14_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: br label %vector.body @@ -677,10 +675,10 @@ define void @vhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly ; CHECK-NEXT: vldrw.u32 q3, [r0], #16 ; CHECK-NEXT: vldrw.u32 q4, [r1], #16 ; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vmov r3, r5, d2 ; CHECK-NEXT: vmov r4, r6, d4 @@ -859,10 +857,10 @@ define void @vrhadd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly ; CHECK-NEXT: vldrw.u32 q3, [r1], #16 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 ; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vmov r3, r12, d2 ; CHECK-NEXT: vmov r4, r5, d4 @@ -1049,10 +1047,10 @@ define void @vrhadd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly ; CHECK-NEXT: vldrw.u32 q3, [r1], #16 ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 ; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s8, s18 ; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vmov.f32 s10, s19 ; CHECK-NEXT: vand q2, q2, q0 ; CHECK-NEXT: vmov r3, r12, d2 ; CHECK-NEXT: vmov r4, r5, d4 diff --git 
a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll index 6bd3ee578b89c..693afc151c796 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -70,15 +70,13 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r4, r7, d4 +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r2, r5, d0 -; CHECK-NEXT: vmov lr, r12, d5 +; CHECK-NEXT: vmov r4, r7, d4 ; CHECK-NEXT: vmov r3, r6, d1 ; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r6, r6, r12 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll index 93967f052b0aa..f9c4965d05ca1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -77,12 +77,10 @@ define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) { ; CHECK-NEXT: vld20.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vld20.32 {q1, q2}, [r3] -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4 -; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vld21.32 {q1, q2}, [r3] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 ; CHECK-NEXT: vadd.i32 q5, q5, q6 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q1, q2 @@ -102,14 +100,14 @@ entry: define void @vld2_v4i32_align1(<8 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld2_v4i32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vmov.f32 s8, s5 ; 
CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s7, s2 ; CHECK-NEXT: vadd.i32 q0, q1, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -207,25 +205,25 @@ define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s6, s3 ; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmovx.f16 s6, s8 ; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s12, s11 ; CHECK-NEXT: vmovx.f16 s7, s10 -; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s10, s11 ; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vins.f16 s7, s12 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vins.f16 s10, s11 ; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -324,15 +322,13 @@ define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: 
vmov lr, r12, d5 +; CHECK-NEXT: vmov r0, r4, d4 ; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r2, r2, r12 @@ -356,34 +352,30 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s18, s14 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r5, r6, d4 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov.f32 s19, s15 -; CHECK-NEXT: vmov.f32 s11, s13 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vmov r0, r7, d8 -; CHECK-NEXT: vmov r5, r6, d4 ; CHECK-NEXT: adds.w lr, lr, r2 ; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, r4, d9 +; CHECK-NEXT: vmov r3, r4, d7 ; CHECK-NEXT: adds r0, r0, r5 ; CHECK-NEXT: adc.w r8, r6, r7 -; CHECK-NEXT: vmov r6, r5, d5 +; CHECK-NEXT: vmov r6, r5, d1 ; CHECK-NEXT: vmov r2, r7, d0 ; CHECK-NEXT: adds r3, r3, r6 ; CHECK-NEXT: adc.w r6, r5, r4 @@ -396,7 +388,7 @@ define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) { ; CHECK-NEXT: adc.w r0, r7, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 8 @@ -480,12 +472,10 @@ 
define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) { ; CHECK-NEXT: vld20.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q0, q0, q1 ; CHECK-NEXT: vld20.32 {q1, q2}, [r3] -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4 -; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vadd.f32 q3, q3, q4 +; CHECK-NEXT: vld21.32 {q5, q6}, [r0] ; CHECK-NEXT: vld21.32 {q1, q2}, [r3] ; CHECK-NEXT: vstrw.32 q3, [r1, #48] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2 ; CHECK-NEXT: vadd.f32 q5, q5, q6 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vadd.f32 q1, q1, q2 @@ -505,14 +495,14 @@ entry: define void @vld2_v4f32_align1(<8 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld2_v4f32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] +; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vmov.f32 s8, s5 ; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vmov.f32 s7, s2 ; CHECK-NEXT: vadd.f32 q0, q1, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -535,11 +525,11 @@ define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) { ; CHECK-NEXT: ldr r0, [r0, #4] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vadd.f16 q0, q0, q2 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: bx lr @@ -556,14 +546,14 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld2_v4f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; 
CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s6, s3 ; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, r2, d0 @@ -620,25 +610,25 @@ define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vldrb.u8 q2, [r0, #16] ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s5, s2 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vldrb.u8 q2, [r0, #16] -; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s6, s3 ; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmovx.f16 s6, s8 ; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s12, s11 ; CHECK-NEXT: vmovx.f16 s7, s10 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s10, s11 ; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vins.f16 s7, s12 +; CHECK-NEXT: vmov.f32 s2, s8 ; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index b998d62b0d9c6..bf76ba3a513ca 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -10,7 +10,6 @@ define void @vld3_v2i32(<6 x i32> *%src, <2 x i32> *%dst) { ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmov.f64 
d2, d0 ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov r12, lr, d0 ; CHECK-NEXT: vmov r3, s6 @@ -37,20 +36,20 @@ define void @vld3_v4i32(<12 x i32> *%src, <4 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vmov.f32 s3, s19 ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -72,37 +71,37 @@ define void @vld3_v8i32(<24 x i32> *%src, <8 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; 
CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.i32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vadd.i32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] @@ -124,71 +123,71 @@ define void @vld3_v16i32(<48 x i32> *%src, <16 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.i32 q0, 
q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vadd.i32 q1, q4, q1 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s19, s25 ; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] ; CHECK-NEXT: vadd.i32 q2, q4, q2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d14, d8 -; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vmov.f32 s12, 
s18 -; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 ; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 ; CHECK-NEXT: vmov.f32 s31, s21 ; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s15, s23 ; CHECK-NEXT: vadd.i32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] @@ -216,23 +215,22 @@ define void @vld3_v2i16(<6 x i16> *%src, <2 x i16> *%dst) { ; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: mov r3, sp ; CHECK-NEXT: str r2, [sp] -; CHECK-NEXT: vmov.f64 d2, d0 ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f64 d6, d1 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vldrh.u32 q1, [r3] -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov.f32 s6, s4 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1, #2] ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: strh r0, [r1] ; CHECK-NEXT: add sp, #8 @@ -292,49 +290,49 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vins.f16 s1, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vins.f16 s0, s2 +; 
CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmovx.f16 s5, s5 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmovx.f16 s2, s15 ; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vins.f16 s19, s2 +; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vmovnb.i32 q5, q0 -; CHECK-NEXT: vmov.f32 s2, s22 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f64 d8, d2 -; CHECK-NEXT: vins.f16 s16, s20 -; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vins.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s5, s8 ; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s11 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s17, s5 +; CHECK-NEXT: vmovx.f16 s5, s11 ; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vins.f16 s18, s20 -; CHECK-NEXT: vmovx.f16 s20, s14 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmovx.f16 s11, s13 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s5, s7 +; CHECK-NEXT: vmovnb.i32 q5, q0 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmovx.f16 s14, s14 ; CHECK-NEXT: vmov.f32 s19, s13 -; CHECK-NEXT: vins.f16 s19, s20 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vins.f16 s20, s6 -; CHECK-NEXT: vmovx.f16 s21, s7 -; CHECK-NEXT: vins.f16 s6, s12 -; CHECK-NEXT: vmovx.f16 s7, s13 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s7, s15 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmovnb.i32 q2, q5 -; CHECK-NEXT: vmov.f32 s22, s10 -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vadd.i16 q1, q4, q5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vins.f16 s10, s12 +; CHECK-NEXT: vins.f16 s11, s15 +; CHECK-NEXT: vins.f16 s19, s14 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmovnb.i32 q3, q1 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s6, s14 +; 
CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vadd.i16 q1, q4, q1 ; CHECK-NEXT: vadd.i16 q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -355,103 +353,98 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d0, d2 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s0, s8 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 ; CHECK-NEXT: vmov.f32 s1, s7 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmovx.f16 s16, s9 -; CHECK-NEXT: vins.f16 s1, s12 ; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vins.f16 s1, s2 ; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmovx.f16 s14, s18 +; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vins.f16 s2, s12 ; CHECK-NEXT: vmovx.f16 s12, s6 +; CHECK-NEXT: vins.f16 s3, s14 +; CHECK-NEXT: vmovx.f16 s14, s19 +; CHECK-NEXT: vins.f16 s18, s14 ; CHECK-NEXT: vins.f16 s5, s12 +; CHECK-NEXT: vmovx.f16 s12, s9 ; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vins.f16 s13, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s13, s12 ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vins.f16 s3, s20 -; CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vins.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s5, s7 +; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmovx.f16 s11, s17 ; CHECK-NEXT: vmov.f32 s23, s18 ; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s5, s9 ; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: 
vins.f16 s10, s16 +; CHECK-NEXT: vins.f16 s11, s19 ; CHECK-NEXT: vmovnb.i32 q6, q3 -; CHECK-NEXT: vmov.f32 s14, s26 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vins.f16 s20, s6 -; CHECK-NEXT: vmovx.f16 s21, s7 -; CHECK-NEXT: vins.f16 s6, s16 -; CHECK-NEXT: vmovx.f16 s7, s17 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s7, s19 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmovnb.i32 q2, q5 -; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmovnb.i32 q4, q1 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s6, s18 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vadd.i16 q0, q0, q5 -; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vmovx.f16 s6, s10 ; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vins.f16 s4, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmovx.f16 s20, s19 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vmovx.f16 s7, s19 ; CHECK-NEXT: vmov.f32 s27, s18 -; CHECK-NEXT: vins.f16 s27, s20 -; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vins.f16 s20, s0 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vins.f16 s27, s7 ; CHECK-NEXT: vmov.f32 s26, s16 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vins.f16 s21, s0 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s15 ; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov.f32 s20, s8 ; 
CHECK-NEXT: vmovnb.i32 q7, q1 -; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s20, s6 +; CHECK-NEXT: vmovx.f16 s6, s12 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s9, s11 +; CHECK-NEXT: vins.f16 s21, s6 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmovx.f16 s15, s17 ; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vins.f16 s22, s0 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vins.f16 s9, s13 +; CHECK-NEXT: vins.f16 s14, s16 +; CHECK-NEXT: vins.f16 s15, s19 +; CHECK-NEXT: vins.f16 s22, s6 +; CHECK-NEXT: vmovx.f16 s6, s18 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vins.f16 s23, s6 +; CHECK-NEXT: vmovnb.i32 q4, q2 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s6, s30 +; CHECK-NEXT: vadd.i16 q2, q5, q2 ; CHECK-NEXT: vmov.f32 s7, s27 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vins.f16 s24, s10 -; CHECK-NEXT: vins.f16 s23, s0 -; CHECK-NEXT: vins.f16 s2, s16 -; CHECK-NEXT: vmovx.f16 s25, s11 -; CHECK-NEXT: vmovx.f16 s3, s17 -; CHECK-NEXT: vins.f16 s25, s13 -; CHECK-NEXT: vins.f16 s3, s19 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovnb.i32 q2, q6 -; CHECK-NEXT: vmov.f32 s26, s10 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vadd.i16 q0, q5, q6 -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vadd.i16 q1, q2, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -558,22 +551,21 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) { ; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: 
vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.16 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.16 q3[1], r0 ; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.16 q3[2], r0 ; CHECK-NEXT: vmov.u8 r0, q0[9] -; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: vmov.16 q3[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[12] -; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.16 q2[4], r2 ; CHECK-NEXT: vmov.16 q3[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[15] ; CHECK-NEXT: vmovx.f16 s16, s6 @@ -581,6 +573,7 @@ define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) { ; CHECK-NEXT: vmovx.f16 s11, s5 ; CHECK-NEXT: vmov.16 q3[5], r0 ; CHECK-NEXT: vins.f16 s18, s16 +; CHECK-NEXT: vins.f16 s10, s4 ; CHECK-NEXT: vins.f16 s11, s7 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.u8 r0, q0[2] @@ -653,12 +646,11 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q4[15], r0 -; CHECK-NEXT: vmov.u8 r0, q2[2] ; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s15, s19 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.8 q4[1], r0 @@ -681,19 +673,20 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vmov.8 q4[10], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 
r0, q2[7] +; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[10] ; CHECK-NEXT: vmov.8 q5[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.8 q5[15], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov q6, q4 ; CHECK-NEXT: vmov.8 q6[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.f32 s18, s26 ; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.u8 r0, q1[2] ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[5] @@ -753,19 +746,15 @@ define void @vld3_v2i64(<6 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: vmov r2, r4, d3 ; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: vmov lr, r12, d7 -; CHECK-NEXT: vmov r2, r4, d5 +; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 @@ -795,50 +784,42 @@ define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q6, [r0, 
#80] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s7, s13 -; CHECK-NEXT: vmov.f32 s11, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov r3, r8, d5 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s22, s24 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s23, s25 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vmov r6, r7, d10 -; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: vmov r3, r8, d7 +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.f32 s24, s22 +; CHECK-NEXT: vmov.f32 s25, s23 +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov r6, r7, d12 ; CHECK-NEXT: adds.w r0, r5, lr ; CHECK-NEXT: adc.w r5, r4, r12 ; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r4, r2, d6 +; CHECK-NEXT: vmov r4, r2, d10 ; CHECK-NEXT: adc.w r12, r5, r8 ; CHECK-NEXT: vmov r5, r0, d8 ; CHECK-NEXT: adds r6, r6, r4 ; CHECK-NEXT: adcs r2, r7 ; CHECK-NEXT: adds r6, r6, r5 ; CHECK-NEXT: adc.w r8, r2, r0 -; CHECK-NEXT: vmov r7, r4, d11 -; CHECK-NEXT: vmov r2, r5, d7 +; CHECK-NEXT: vmov r7, r4, d1 +; CHECK-NEXT: vmov r2, r5, d9 ; CHECK-NEXT: vmov r3, r0, d0 ; CHECK-NEXT: adds r2, r2, r7 ; CHECK-NEXT: adc.w r7, r5, r4 -; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: vmov r5, r4, d7 ; CHECK-NEXT: adds r2, r2, r5 ; CHECK-NEXT: adcs r7, r4 ; CHECK-NEXT: vmov r5, r4, d2 @@ -853,7 +834,7 @@ define void @vld3_v4i64(<12 x i64> *%src, <4 x i64> *%dst) { ; CHECK-NEXT: adcs r0, r5 ; CHECK-NEXT: vmov q0[3], q0[1], 
r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, <12 x i64>* %src, align 4 @@ -874,7 +855,7 @@ define void @vld3_v2f32(<6 x float> *%src, <2 x float> *%dst) { ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldr s1, [r0, #16] ; CHECK-NEXT: vldr s5, [r0, #20] -; CHECK-NEXT: vmov.f64 d6, d4 +; CHECK-NEXT: vmov.f32 s12, s8 ; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vmov.f32 s0, s9 ; CHECK-NEXT: vadd.f32 q0, q3, q0 @@ -898,20 +879,20 @@ define void @vld3_v4f32(<12 x float> *%src, <4 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 ; CHECK-NEXT: vmov.f32 s3, s19 ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -933,37 +914,37 @@ define void @vld3_v8f32(<24 x float> *%src, <8 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, 
s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s16, s9 ; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vadd.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vadd.f32 q1, q4, q1 ; CHECK-NEXT: vstrw.32 q1, [r1] @@ -985,71 +966,71 @@ define void @vld3_v16f32(<48 x float> *%src, <16 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #80] ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s14, 
s3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s15, s18 ; CHECK-NEXT: vmov.f32 s11, s17 ; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.f32 q0, q2, q0 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f64 d10, d4 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s21, s11 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vadd.f32 q1, q4, q1 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 ; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 ; CHECK-NEXT: vmov.f32 s23, s26 ; CHECK-NEXT: vmov.f32 s19, s25 ; CHECK-NEXT: vadd.f32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] ; CHECK-NEXT: vadd.f32 q2, 
q4, q2 ; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d14, d8 -; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 ; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 ; CHECK-NEXT: vmov.f32 s31, s21 ; CHECK-NEXT: vadd.f32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s15, s23 ; CHECK-NEXT: vadd.f32 q3, q6, q3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] @@ -1079,9 +1060,9 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) { ; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vins.f16 s8, s2 -; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s2, s1 ; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vadd.f16 q1, q0, q2 ; CHECK-NEXT: vmov.f32 s0, s1 ; CHECK-NEXT: vadd.f16 q0, q1, q0 @@ -1102,32 +1083,29 @@ entry: define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: ldrd r2, r3, [r0, #16] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vins.f16 s1, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vins.f16 s4, s16 -; CHECK-NEXT: 
vmovx.f16 s16, s6 -; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s7, s0 ; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s12, s6 ; CHECK-NEXT: vins.f16 s13, s9 ; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 @@ -1143,49 +1121,47 @@ entry: define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s17 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vins.f16 s11, s20 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vmov.f32 s28, s18 -; CHECK-NEXT: vins.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s20, s19 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s26, s16 -; CHECK-NEXT: vmovx.f16 s21, s3 
-; CHECK-NEXT: vins.f16 s3, s26 -; CHECK-NEXT: vins.f16 s21, s17 -; CHECK-NEXT: vmovx.f16 s30, s14 -; CHECK-NEXT: vmovx.f16 s23, s13 -; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vins.f16 s7, s6 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s15, s15 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s6, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s16, s2 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s13, s30 -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: vmov.f32 s2, s28 -; CHECK-NEXT: vmovx.f16 s22, s18 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vins.f16 s22, s12 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vins.f16 s17, s13 +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 @@ -1201,89 +1177,85 @@ entry: define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave 
{d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vins.f16 s7, s6 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmovx.f16 s26, s12 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vins.f16 s19, s20 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vins.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vmov.f32 s28, s14 -; CHECK-NEXT: vmovx.f16 s30, s10 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vins.f16 s3, s26 -; CHECK-NEXT: vins.f16 s21, s13 -; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vmovx.f16 s23, s9 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s15, s15 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s6, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vins.f16 s17, s13 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s9, s30 -; CHECK-NEXT: vins.f16 s23, s11 -; CHECK-NEXT: vmovx.f16 s22, s14 -; CHECK-NEXT: vmov.f32 s2, s28 -; 
CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vins.f16 s18, s8 ; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s19 -; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vadd.f16 q1, q0, q1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmovx.f16 s20, s11 -; CHECK-NEXT: vins.f16 s4, s16 -; CHECK-NEXT: vmovx.f16 s16, s13 ; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmovx.f16 s24, s1 -; CHECK-NEXT: vins.f16 s5, s16 -; CHECK-NEXT: vmov.f32 s19, s10 -; CHECK-NEXT: vins.f16 s19, s20 +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vins.f16 s7, s6 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vmov.f32 s28, s14 -; CHECK-NEXT: vins.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s0, s24 -; CHECK-NEXT: vins.f16 s20, s2 -; CHECK-NEXT: vmovx.f16 s21, s3 -; CHECK-NEXT: vmovx.f16 s26, s12 -; CHECK-NEXT: vins.f16 s21, s13 -; CHECK-NEXT: vins.f16 s3, s26 -; CHECK-NEXT: vmovx.f16 s30, s10 -; CHECK-NEXT: vmovx.f16 s23, s9 -; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vins.f16 s9, s30 -; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vmovx.f16 s15, s15 +; CHECK-NEXT: vmovx.f16 s12, s12 +; CHECK-NEXT: vmovx.f16 s10, s10 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s6, s16 +; 
CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s20, s15 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s16, s2 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmovx.f16 s22, s14 -; CHECK-NEXT: vmov.f32 s2, s28 -; CHECK-NEXT: vins.f16 s22, s8 +; CHECK-NEXT: vins.f16 s17, s13 +; CHECK-NEXT: vins.f16 s19, s11 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.f32 s2, s20 ; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s19 -; CHECK-NEXT: vadd.f16 q0, q0, q5 +; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 @@ -1303,11 +1275,11 @@ define void @vld3_v2f64(<6 x double> *%src, <2 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vadd.f64 d4, d3, d0 -; CHECK-NEXT: vadd.f64 d5, d6, d7 -; CHECK-NEXT: vadd.f64 d1, d4, d1 -; CHECK-NEXT: vadd.f64 d0, d5, d2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.f64 d0, d3, d0 +; CHECK-NEXT: vadd.f64 d3, d4, d5 +; CHECK-NEXT: vadd.f64 d1, d0, d1 +; CHECK-NEXT: vadd.f64 d0, d3, d2 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -1324,25 +1296,25 @@ entry: define void @vld3_v4f64(<12 x double> *%src, <4 x double> *%dst) { ; CHECK-LABEL: vld3_v4f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q1, [r0, #80] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vadd.f64 d5, d6, d7 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; 
CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vadd.f64 d4, d1, d2 -; CHECK-NEXT: vadd.f64 d10, d9, d6 -; CHECK-NEXT: vadd.f64 d11, d12, d13 -; CHECK-NEXT: vadd.f64 d3, d4, d3 -; CHECK-NEXT: vadd.f64 d2, d5, d0 -; CHECK-NEXT: vadd.f64 d1, d10, d7 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vadd.f64 d0, d11, d8 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vadd.f64 d2, d4, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vadd.f64 d4, d7, d4 +; CHECK-NEXT: vadd.f64 d7, d8, d9 +; CHECK-NEXT: vadd.f64 d1, d1, d3 +; CHECK-NEXT: vadd.f64 d0, d2, d0 +; CHECK-NEXT: vadd.f64 d3, d4, d5 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vadd.f64 d2, d7, d6 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x double>, <12 x double>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll index 06c10e0b7bb1a..8ddfb5fb44878 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -6,18 +6,14 @@ define <16 x i32> *@vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i32>, <16 x i32>* %src, align 4 @@ -38,18 +34,14 @@ entry: define <32 x i16> *@vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 4 @@ -70,18 +62,14 @@ entry: define <64 x i8> *@vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld4_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i8 q4, q2, q3 +; CHECK-NEXT: vadd.i8 q2, q2, q3 ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; CHECK-NEXT: vadd.i8 q0, q0, q4 +; CHECK-NEXT: vadd.i8 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i8>, <64 x i8>* %src, align 4 @@ -109,23 +97,19 @@ define <8 x i64> *@vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0], #64 -; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov r4, r8, d9 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r2, r7, d1 -; CHECK-NEXT: vmov r4, r8, d7 -; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r3, r6, d1 ; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: adc.w r7, r7, r12 ; CHECK-NEXT: adds r3, r3, r4 @@ -166,18 +150,14 @@ entry: define <16 x float> *@vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld4_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 q0, q0, q4 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x float>, <16 x float>* %src, align 4 @@ -198,18 +178,14 @@ entry: define <32 x half> *@vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld4_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f16 q4, q2, q3 +; CHECK-NEXT: vadd.f16 q2, q2, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index 74b6b8d7e2843..5058013576343 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -6,17 +6,17 @@ define void @vld4_v2i32(<8 x i32> *%src, <2 x i32> *%dst) { ; CHECK-LABEL: vld4_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s8, s3 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s8, s3 +; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: 
add r2, r3 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: add.w r12, r2, r0 @@ -44,18 +44,14 @@ entry: define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vld4_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q4, q2, q3 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x i32>, <16 x i32>* %src, align 4 @@ -79,7 +75,6 @@ define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) { ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.i32 q4, q2, q3 ; CHECK-NEXT: vadd.i32 q5, q0, q1 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] @@ -88,10 +83,9 @@ define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) { ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q5, q2, q3 +; CHECK-NEXT: vadd.i32 q2, q2, q3 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: vadd.i32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -111,12 +105,10 @@ entry: define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vld4_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5} -; CHECK-NEXT: push {r4, r5} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: add.w r3, r0, #192 @@ -124,52 +116,40 @@ define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) { ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: adds r0, #128 ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i32 q4, q2, q3 -; CHECK-NEXT: vadd.i32 q6, q0, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q2, q6, q2 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q2, q3, q4 -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, q2 -; CHECK-NEXT: vadd.i32 q1, q3, q1 -; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.i32 q1, q5, q6 ; CHECK-NEXT: vadd.i32 q2, q3, q4 +; CHECK-NEXT: 
vadd.i32 q0, q2, q1 +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q6, q7 +; CHECK-NEXT: vadd.i32 q2, q4, q5 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.i32 q1, q2, q1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #112 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5} +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i32>, <64 x i32>* %src, align 4 @@ -189,25 +169,25 @@ define void @vld4_v4i32_align1(<16 x i32> *%src, <4 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u8 q2, [r0] -; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vldrb.u8 q1, [r0, #32] ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] +; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s16, s11 -; CHECK-NEXT: vmov.f64 d10, d5 +; CHECK-NEXT: vmov.f32 s20, s10 ; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s21, s14 -; CHECK-NEXT: vmov.f32 s18, s7 ; 
CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s23, s2 ; CHECK-NEXT: vadd.i32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s20, s9 ; CHECK-NEXT: vmov.f32 s21, s13 -; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s22, s5 -; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s23, s1 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.i32 q0, q2, q5 ; CHECK-NEXT: vadd.i32 q0, q0, q4 @@ -316,18 +296,14 @@ entry: define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q4, q2, q3 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 2 @@ -351,7 +327,6 @@ define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) { ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.i16 q4, q2, q3 ; CHECK-NEXT: vadd.i16 q5, q0, q1 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] @@ -360,10 +335,9 @@ define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) { ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i16 q5, q2, q3 +; CHECK-NEXT: vadd.i16 q2, q2, q3 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -383,58 +357,56 @@ entry: define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld4_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrb.u8 q1, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vmovx.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s0, s7 ; CHECK-NEXT: vins.f16 s18, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 ; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s0, s11 ; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vins.f16 s19, s0 ; CHECK-NEXT: vldrb.u8 q0, [r0] ; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vmovx.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.f32 s22, s5 ; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vmovx.f16 s12, s3 ; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vldrb.u8 q3, [r0, #16] ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s20, s15 -; CHECK-NEXT: vmovx.f16 s17, s13 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmov.f32 s22, s5 ; CHECK-NEXT: vmov.f32 s23, s9 +; CHECK-NEXT: vmovx.f16 s17, s13 +; CHECK-NEXT: vmovx.f16 s20, s15 ; CHECK-NEXT: vins.f16 s13, s15 +; 
CHECK-NEXT: vins.f16 s17, s20 ; CHECK-NEXT: vmov.f32 s20, s1 +; CHECK-NEXT: vmovx.f16 s1, s6 ; CHECK-NEXT: vmov.f32 s21, s13 ; CHECK-NEXT: vadd.i16 q4, q5, q4 ; CHECK-NEXT: vmovx.f16 s22, s4 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmovx.f16 s23, s8 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmov.f32 s6, s4 -; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vmovx.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s1, s10 ; CHECK-NEXT: vmovx.f16 s20, s0 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vins.f16 s23, s1 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vins.f16 s20, s1 ; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s21, s24 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s3, s8 ; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vadd.i16 q0, q0, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x i16>, <32 x i16>* %src, align 1 @@ -608,18 +580,14 @@ entry: define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld4_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.8 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.8 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.i8 q4, q2, q3 +; CHECK-NEXT: vadd.i8 q2, q2, q3 ; CHECK-NEXT: vadd.i8 q0, q0, q1 -; 
CHECK-NEXT: vadd.i8 q0, q0, q4 +; CHECK-NEXT: vadd.i8 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x i8>, <64 x i8>* %src, align 1 @@ -646,23 +614,19 @@ define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) { ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 ; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 ; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov lr, r12, d5 ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f64 d6, d5 +; CHECK-NEXT: vmov r0, r8, d9 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s10, s16 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r0, r8, d7 -; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vmov r5, r6, d1 ; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: vmov r4, r12, d2 @@ -706,54 +670,45 @@ define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vldrw.u32 q7, [r0, #112] +; 
CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmov r3, r2, d11 +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vmov.f32 s0, s26 +; CHECK-NEXT: vmov.f32 s1, s27 +; CHECK-NEXT: vmov lr, r12, d9 +; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vmov.f64 d4, d11 -; CHECK-NEXT: vmov.f32 s9, s23 -; CHECK-NEXT: vmov r3, r2, d7 -; CHECK-NEXT: vmov r4, r5, d3 -; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vmov.f32 s11, s19 -; CHECK-NEXT: vmov.f32 s22, s16 -; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s17 ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov q7, q5 -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vmov r0, r6, d15 -; CHECK-NEXT: vmov.f64 d14, d11 -; CHECK-NEXT: vmov.f32 s29, s23 -; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vmov.f32 s30, s26 -; CHECK-NEXT: vmov.f32 s22, s24 -; CHECK-NEXT: vmov.f32 s31, s27 -; CHECK-NEXT: vmov.f32 s23, s25 -; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vmov.f32 s6, s28 +; CHECK-NEXT: vmov.f32 s7, s29 +; CHECK-NEXT: vmov.f32 s10, s20 +; CHECK-NEXT: vmov.f32 s11, s21 +; CHECK-NEXT: vmov r0, r6, d1 ; CHECK-NEXT: adds r7, r4, r3 +; CHECK-NEXT: vmov r4, r8, d0 ; CHECK-NEXT: adcs r5, r2 -; CHECK-NEXT: vmov r4, r8, d14 -; CHECK-NEXT: vmov r2, r3, d10 -; CHECK-NEXT: vmov.f32 s25, s19 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vmov.f32 s18, s0 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vmov.f32 s19, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: vmov.f32 s0, s18 +; CHECK-NEXT: vmov.f32 s1, s19 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r6, r6, r12 ; CHECK-NEXT: adds.w lr, r0, r7 ; CHECK-NEXT: adc.w r12, r6, r5 -; CHECK-NEXT: vmov r6, r5, 
d12 +; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov r4, r0, d8 ; CHECK-NEXT: adc.w r3, r3, r8 @@ -762,11 +717,11 @@ define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) { ; CHECK-NEXT: adds.w r9, r6, r2 ; CHECK-NEXT: adc.w r8, r0, r3 ; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vmov r3, r6, d11 -; CHECK-NEXT: vmov r7, r0, d9 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: vmov r7, r0, d5 ; CHECK-NEXT: adds r3, r3, r5 ; CHECK-NEXT: adcs r6, r4 -; CHECK-NEXT: vmov r5, r4, d13 +; CHECK-NEXT: vmov r5, r4, d11 ; CHECK-NEXT: adds r5, r5, r7 ; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: adds r3, r3, r5 @@ -808,11 +763,11 @@ entry: define void @vld4_v2f32(<8 x float> *%src, <2 x float> *%dst) { ; CHECK-LABEL: vld4_v2f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.f64 d6, d3 ; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s13, s2 ; CHECK-NEXT: vadd.f32 q2, q3, q2 ; CHECK-NEXT: vmov.f32 s12, s5 @@ -838,18 +793,14 @@ entry: define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) { ; CHECK-LABEL: vld4_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q4, q2, q3 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 q0, q0, q4 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x float>, <16 x float>* %src, align 4 @@ -873,7 +824,6 @@ define void @vld4_v8f32(<32 x float> *%src, <8 x 
float> *%dst) { ; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 ; CHECK-NEXT: vadd.f32 q4, q2, q3 ; CHECK-NEXT: vadd.f32 q5, q0, q1 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] @@ -882,10 +832,9 @@ define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) { ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q5, q2, q3 +; CHECK-NEXT: vadd.f32 q2, q2, q3 ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: vadd.f32 q0, q0, q5 +; CHECK-NEXT: vadd.f32 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -905,12 +854,10 @@ entry: define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vld4_v16f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5} -; CHECK-NEXT: push {r4, r5} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: add.w r3, r0, #192 @@ -918,52 +865,40 @@ define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) { ; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: adds r0, #128 ; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! 
-; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f32 q4, q2, q3 -; CHECK-NEXT: vadd.f32 q6, q0, q1 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] -; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 q0, q1, q0 -; CHECK-NEXT: vadd.f32 q2, q6, q2 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vadd.f32 q2, q3, q4 -; CHECK-NEXT: vadd.f32 q0, q0, q2 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r0] -; CHECK-NEXT: vstrw.32 q4, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 q0, q0, q2 -; CHECK-NEXT: vadd.f32 q1, q3, q1 -; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vld40.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 +; CHECK-NEXT: vld41.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vld42.32 {q3, q4, q5, q6}, [r3] +; CHECK-NEXT: vld43.32 {q3, q4, q5, q6}, [r3] ; CHECK-NEXT: vadd.f32 q1, q5, q6 ; CHECK-NEXT: vadd.f32 q2, q3, q4 +; CHECK-NEXT: 
vadd.f32 q0, q2, q1 +; CHECK-NEXT: vld40.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld41.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld42.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vld43.32 {q4, q5, q6, q7}, [r2] +; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vadd.f32 q2, q2, q3 +; CHECK-NEXT: vadd.f32 q0, q0, q2 +; CHECK-NEXT: vadd.f32 q1, q6, q7 +; CHECK-NEXT: vadd.f32 q2, q4, q5 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vadd.f32 q1, q2, q1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #112 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5} +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x float>, <64 x float>* %src, align 4 @@ -983,25 +918,25 @@ define void @vld4_v4f32_align1(<16 x float> *%src, <4 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u8 q2, [r0] -; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vldrb.u8 q1, [r0, #32] ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] +; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vldrb.u8 q2, [r0] +; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov.f32 s16, s11 -; CHECK-NEXT: vmov.f64 d10, d5 +; CHECK-NEXT: vmov.f32 s20, s10 ; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s21, s14 -; CHECK-NEXT: vmov.f32 s18, s7 
; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s23, s2 ; CHECK-NEXT: vadd.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s20, s9 ; CHECK-NEXT: vmov.f32 s21, s13 -; CHECK-NEXT: vmov.f32 s9, s12 ; CHECK-NEXT: vmov.f32 s22, s5 -; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s23, s1 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.f32 s10, s4 ; CHECK-NEXT: vmov.f32 s11, s0 ; CHECK-NEXT: vadd.f32 q0, q2, q5 ; CHECK-NEXT: vadd.f32 q0, q0, q4 @@ -1027,17 +962,17 @@ define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) { ; CHECK-LABEL: vld4_v2f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s12, s4 -; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vadd.f16 q0, q0, q3 +; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vadd.f16 q1, q1, q2 +; CHECK-NEXT: vadd.f16 q0, q0, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] @@ -1058,27 +993,27 @@ entry: define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrh.u16 q0, [r0] -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vldrh.u16 q2, [r0, #16] +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s6, s2 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s16, s3 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s5, s8 -; CHECK-NEXT: vins.f16 s5, s12 ; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vins.f16 s8, s10 
-; CHECK-NEXT: vmovx.f16 s16, s11 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s5, s8 +; CHECK-NEXT: vmovx.f16 s6, s10 +; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vmovx.f16 s13, s9 +; CHECK-NEXT: vmovx.f16 s2, s11 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s13, s16 ; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vins.f16 s13, s2 ; CHECK-NEXT: vmov.f32 s1, s8 ; CHECK-NEXT: vmov.f32 s17, s9 ; CHECK-NEXT: vadd.f16 q0, q0, q1 @@ -1086,7 +1021,7 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) { ; CHECK-NEXT: vadd.f16 q0, q0, q3 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <16 x half>, <16 x half>* %src, align 2 @@ -1104,18 +1039,14 @@ entry: define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld4_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 -; CHECK-NEXT: vadd.f16 q4, q2, q3 +; CHECK-NEXT: vadd.f16 q2, q2, q3 ; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %l1 = load <32 x half>, <32 x half>* %src, align 2 @@ -1133,37 +1064,25 @@ entry: define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld4_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5} -; CHECK-NEXT: push {r4, r5} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: 
.pad #80 -; CHECK-NEXT: sub sp, #80 ; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! ; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0] -; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vadd.f16 q2, q2, q3 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0] +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0] ; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0] -; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7 -; CHECK-NEXT: vadd.f16 q0, q6, q7 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vadd.f16 q6, q6, q7 ; CHECK-NEXT: vadd.f16 q4, q4, q5 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.f16 q4, q4, q0 -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload -; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 +; CHECK-NEXT: vadd.f16 q4, q4, q6 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vadd.f16 q4, q2, q3 -; CHECK-NEXT: vadd.f16 q0, q0, q1 -; CHECK-NEXT: vadd.f16 q0, q0, q4 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5} ; CHECK-NEXT: bx lr entry: %l1 = load <64 x half>, <64 x half>* %src, align 2 @@ -1185,48 +1104,48 @@ define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vins.f16 s1, s3 ; CHECK-NEXT: vins.f16 s19, s4 ; CHECK-NEXT: vldrb.u8 q1, 
[r0] -; CHECK-NEXT: vmovx.f16 s24, s2 -; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmovx.f16 s22, s0 +; CHECK-NEXT: vmovx.f16 s3, s2 ; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vins.f16 s22, s3 +; CHECK-NEXT: vmovx.f16 s23, s8 ; CHECK-NEXT: vmovx.f16 s17, s13 +; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmovx.f16 s3, s10 ; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s22, s0 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s24, s10 -; CHECK-NEXT: vmovx.f16 s23, s8 -; CHECK-NEXT: vins.f16 s13, s15 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vins.f16 s23, s3 ; CHECK-NEXT: vmovx.f16 s20, s4 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vins.f16 s20, s24 -; CHECK-NEXT: vmovx.f16 s24, s14 +; CHECK-NEXT: vmovx.f16 s3, s6 +; CHECK-NEXT: vins.f16 s9, s11 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s20, s3 ; CHECK-NEXT: vmovx.f16 s21, s12 +; CHECK-NEXT: vmovx.f16 s3, s14 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmov.f32 s26, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vins.f16 s12, s14 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmov.f32 s27, s9 ; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s21, s3 +; CHECK-NEXT: vmov.f32 s26, s1 +; CHECK-NEXT: vmov.f32 s27, s9 +; CHECK-NEXT: vmov.f32 s25, s13 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vadd.f16 q4, q6, q4 ; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s25, s13 ; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vadd.f16 q4, q6, q4 ; CHECK-NEXT: vadd.f16 q0, q1, q5 ; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll index 
d26757fc99e89..930212ddc59c0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll @@ -30,9 +30,9 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! -; CHECK-NEXT: vmul.f16 q2, q0, q0 -; CHECK-NEXT: vfma.f16 q2, q1, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vmul.f16 q0, q0, q0 +; CHECK-NEXT: vfma.f16 q0, q1, q1 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r4, r2 @@ -159,9 +159,9 @@ define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* n ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] ; CHECK-NEXT: vld21.32 {q0, q1}, [r0]! -; CHECK-NEXT: vmul.f32 q2, q0, q0 -; CHECK-NEXT: vfma.f32 q2, q1, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vmul.f32 q0, q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, q1 +; CHECK-NEXT: vstrb.8 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r4, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll index c073ffbe4a42e..e69d06d475300 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -6,125 +6,119 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: lsrs.w r2, r12, #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: 
@ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r2, [sp, #88] +; CHECK-NEXT: ldr r2, [sp, #56] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q4, [r0, #32] -; CHECK-NEXT: vldrh.u16 q5, [r0, #48] +; CHECK-NEXT: vldrh.u16 q1, [r0, #32] +; CHECK-NEXT: vldrh.u16 q4, [r0, #48] ; CHECK-NEXT: vldrh.u16 q3, [r0], #64 -; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmovx.f16 s26, s4 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] +; CHECK-NEXT: vmovx.f16 s27, s16 +; CHECK-NEXT: vins.f16 s26, s6 +; CHECK-NEXT: vmovx.f16 s6, s18 +; CHECK-NEXT: vmovx.f16 s8, s7 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vins.f16 s10, s8 +; CHECK-NEXT: vins.f16 s27, s6 +; CHECK-NEXT: vmovx.f16 s6, s14 ; CHECK-NEXT: vmovx.f16 s8, s19 -; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] -; CHECK-NEXT: vins.f16 s2, s19 -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmovx.f16 s5, s25 -; CHECK-NEXT: vins.f16 s3, s23 -; CHECK-NEXT: vmovx.f16 s6, s17 +; CHECK-NEXT: vmovx.f16 s11, s17 ; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s8, s23 -; CHECK-NEXT: vmovx.f16 s7, s21 -; CHECK-NEXT: vins.f16 s0, s15 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmovx.f16 s4, s13 -; CHECK-NEXT: vins.f16 s25, s27 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s8, s27 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vmul.f16 q2, q1, r2 -; CHECK-NEXT: vmul.f16 q0, q0, r2 -; CHECK-NEXT: vmovx.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s28, s8 -; CHECK-NEXT: vins.f16 s7, s28 -; CHECK-NEXT: vmovx.f16 s30, s16 -; CHECK-NEXT: vmovx.f16 s31, s20 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s28, s12 -; 
CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s29, s24 -; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vins.f16 s11, s8 +; CHECK-NEXT: vmovx.f16 s25, s20 +; CHECK-NEXT: vins.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s6, s22 +; CHECK-NEXT: vmovx.f16 s1, s15 +; CHECK-NEXT: vmovx.f16 s8, s13 ; CHECK-NEXT: vins.f16 s20, s22 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vins.f16 s25, s6 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vins.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s9, s21 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s1, s23 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s31, s4 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vins.f16 s24, s26 -; CHECK-NEXT: vmov.f32 s15, s20 -; CHECK-NEXT: vins.f16 s28, s4 -; CHECK-NEXT: vmovx.f16 s4, s26 -; CHECK-NEXT: vmov.f32 s13, s24 -; CHECK-NEXT: vins.f16 s29, s4 +; CHECK-NEXT: vins.f16 s21, s23 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s15, s16 +; CHECK-NEXT: vins.f16 s9, s1 +; CHECK-NEXT: vmov.f32 s13, s20 +; CHECK-NEXT: vmul.f16 q6, q6, r2 ; CHECK-NEXT: vmul.f16 q3, q3, r2 -; CHECK-NEXT: vmul.f16 q7, q7, r2 +; CHECK-NEXT: vins.f16 s2, s7 +; CHECK-NEXT: vins.f16 s3, s19 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmul.f16 q0, q0, r2 ; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s6, s28 +; CHECK-NEXT: vmovx.f16 s6, s24 +; CHECK-NEXT: vmul.f16 q2, q2, r2 +; CHECK-NEXT: vmovx.f16 s7, s0 ; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s8, s8 ; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s5, s9 -; CHECK-NEXT: vins.f16 s12, s28 -; CHECK-NEXT: vins.f16 s6, s5 -; CHECK-NEXT: vmovx.f16 s18, s13 -; CHECK-NEXT: vmovx.f16 s5, s29 -; CHECK-NEXT: vins.f16 s1, s9 -; CHECK-NEXT: vins.f16 s18, s5 -; CHECK-NEXT: vmovx.f16 s23, s2 -; CHECK-NEXT: vmovx.f16 s5, s10 -; CHECK-NEXT: vins.f16 s2, s10 -; CHECK-NEXT: 
vins.f16 s23, s5 -; CHECK-NEXT: vins.f16 s13, s29 -; CHECK-NEXT: vmovx.f16 s27, s3 +; CHECK-NEXT: vmovx.f16 s5, s1 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmovx.f16 s8, s25 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vmovx.f16 s19, s2 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s18, s14 +; CHECK-NEXT: vins.f16 s19, s8 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmovx.f16 s23, s3 ; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmovx.f16 s22, s14 -; CHECK-NEXT: vins.f16 s27, s8 -; CHECK-NEXT: vins.f16 s14, s30 -; CHECK-NEXT: vmovx.f16 s26, s15 -; CHECK-NEXT: vins.f16 s15, s31 -; CHECK-NEXT: vmovx.f16 s8, s31 +; CHECK-NEXT: vins.f16 s14, s26 +; CHECK-NEXT: vins.f16 s23, s8 +; CHECK-NEXT: vmovx.f16 s22, s15 +; CHECK-NEXT: vins.f16 s15, s27 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vins.f16 s12, s24 +; CHECK-NEXT: vins.f16 s13, s25 ; CHECK-NEXT: vins.f16 s3, s11 -; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vins.f16 s1, s9 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vins.f16 s22, s8 ; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vmovx.f16 s5, s30 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s17, s0 ; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s29, s0 -; CHECK-NEXT: vins.f16 s22, s5 +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s11, s31 -; CHECK-NEXT: vmov q7, q0 -; CHECK-NEXT: vmov.f32 s31, s6 -; CHECK-NEXT: vmov.f32 s16, s13 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: vmov.f32 s17, s29 -; CHECK-NEXT: vmov.f32 s20, s14 -; CHECK-NEXT: vmov.f32 s24, s15 -; CHECK-NEXT: vstrh.16 q5, [r1, #32] -; CHECK-NEXT: vstrh.16 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vstrh.16 q4, [r1, #32] +; CHECK-NEXT: 
vmov.f32 s20, s15 +; CHECK-NEXT: vmov.f32 s7, s5 +; CHECK-NEXT: vstrh.16 q5, [r1, #48] ; CHECK-NEXT: vstrh.16 q2, [r1], #64 -; CHECK-NEXT: vmov.f32 s19, s31 -; CHECK-NEXT: vstrh.16 q4, [r1, #-48] +; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vmov.f32 s5, s25 +; CHECK-NEXT: vstrh.16 q1, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r7, pc} entry: %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll index 9b0bc7e72516c..f2d9593f26418 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -176,8 +176,8 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: vmovn64_b2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -185,8 +185,8 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2) { ; CHECKBE-LABEL: vmovn64_b2: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vmov.f32 s4, s6 -; CHECKBE-NEXT: vmov.f32 s5, s7 ; CHECKBE-NEXT: vmov.f32 s6, s0 +; CHECKBE-NEXT: vmov.f32 s5, s7 ; CHECKBE-NEXT: vmov.f32 s7, s1 ; CHECKBE-NEXT: vmov q0, q1 ; CHECKBE-NEXT: bx lr @@ -199,16 +199,16 @@ define arm_aapcs_vfpcc <2 x i64> @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: vmovn64_b3: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn64_b3: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vmov.f32 s0, s2 -; CHECKBE-NEXT: vmov.f32 s1, s3 ; 
CHECKBE-NEXT: vmov.f32 s2, s4 +; CHECKBE-NEXT: vmov.f32 s1, s3 ; CHECKBE-NEXT: vmov.f32 s3, s5 ; CHECKBE-NEXT: bx lr entry: @@ -301,11 +301,11 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn32_b2: @@ -326,22 +326,21 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn32_b3: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vrev64.32 q2, q1 ; CHECKBE-NEXT: vrev64.32 q1, q0 -; CHECKBE-NEXT: vmov.f32 s12, s5 -; CHECKBE-NEXT: vmov.f32 s13, s8 -; CHECKBE-NEXT: vmov.f32 s14, s7 -; CHECKBE-NEXT: vmov.f32 s15, s10 -; CHECKBE-NEXT: vrev64.32 q0, q3 +; CHECKBE-NEXT: vmov.f32 s4, s5 +; CHECKBE-NEXT: vmov.f32 s6, s7 +; CHECKBE-NEXT: vmov.f32 s5, s8 +; CHECKBE-NEXT: vmov.f32 s7, s10 +; CHECKBE-NEXT: vrev64.32 q0, q1 ; CHECKBE-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -450,15 +449,15 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vmovn16_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s9, s5 -; CHECK-NEXT: vins.f16 s9, s1 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s6 -; CHECK-NEXT: 
vins.f16 s10, s2 -; CHECK-NEXT: vmovx.f16 s11, s7 -; CHECK-NEXT: vins.f16 s11, s3 -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmovx.f16 s5, s5 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vins.f16 s5, s1 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vins.f16 s7, s3 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16_b2: @@ -466,12 +465,12 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) { ; CHECKBE-NEXT: vrev64.16 q2, q0 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: vmovx.f16 s5, s1 -; CHECKBE-NEXT: vins.f16 s5, s9 ; CHECKBE-NEXT: vmovx.f16 s4, s0 -; CHECKBE-NEXT: vins.f16 s4, s8 ; CHECKBE-NEXT: vmovx.f16 s6, s2 -; CHECKBE-NEXT: vins.f16 s6, s10 ; CHECKBE-NEXT: vmovx.f16 s7, s3 +; CHECKBE-NEXT: vins.f16 s5, s9 +; CHECKBE-NEXT: vins.f16 s4, s8 +; CHECKBE-NEXT: vins.f16 s6, s10 ; CHECKBE-NEXT: vins.f16 s7, s11 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr @@ -483,28 +482,27 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: vmovn16_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmovx.f16 s1, s9 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmovx.f16 s3, s3 ; CHECK-NEXT: vins.f16 s1, s5 -; CHECK-NEXT: vmovx.f16 s0, s8 ; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmovx.f16 s2, s10 ; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s3, s11 ; CHECK-NEXT: vins.f16 s3, s7 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16_b3: ; CHECKBE: @ %bb.0: @ %entry -; CHECKBE-NEXT: vrev64.16 q3, q0 ; CHECKBE-NEXT: vrev64.16 q2, q1 -; CHECKBE-NEXT: vmovx.f16 s5, s13 +; CHECKBE-NEXT: vrev64.16 q1, q0 +; CHECKBE-NEXT: vmovx.f16 s5, s5 +; CHECKBE-NEXT: vmovx.f16 s4, s4 +; CHECKBE-NEXT: vmovx.f16 s6, s6 +; CHECKBE-NEXT: vmovx.f16 s7, s7 ; CHECKBE-NEXT: vins.f16 s5, s9 -; CHECKBE-NEXT: 
vmovx.f16 s4, s12 ; CHECKBE-NEXT: vins.f16 s4, s8 -; CHECKBE-NEXT: vmovx.f16 s6, s14 ; CHECKBE-NEXT: vins.f16 s6, s10 -; CHECKBE-NEXT: vmovx.f16 s7, s15 ; CHECKBE-NEXT: vins.f16 s7, s11 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll index e4ae6e5dff3a2..b8ddde719a67e 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll @@ -148,11 +148,11 @@ entry: define arm_aapcs_vfpcc void @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { ; CHECK-LABEL: vmovn64_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s1 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s4, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> @@ -163,11 +163,11 @@ entry: define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { ; CHECK-LABEL: vmovn64_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> @@ -232,11 +232,11 @@ entry: define arm_aapcs_vfpcc void @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { ; CHECK-LABEL: vmovn32_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vstrw.32 q2, [r0] +; 
CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -247,11 +247,11 @@ entry: define arm_aapcs_vfpcc void @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { ; CHECK-LABEL: vmovn32_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> @@ -314,15 +314,15 @@ entry: define arm_aapcs_vfpcc void @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn16_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s9, s5 -; CHECK-NEXT: vins.f16 s9, s1 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s6 -; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vmovx.f16 s11, s7 -; CHECK-NEXT: vins.f16 s11, s3 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s5, s5 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vins.f16 s5, s1 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vins.f16 s7, s3 +; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> @@ -333,15 +333,15 @@ entry: define arm_aapcs_vfpcc void @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { ; CHECK-LABEL: vmovn16_b3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s9, s1 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: 
vmovx.f16 s10, s2 -; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmovx.f16 s11, s3 -; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmovx.f16 s2, s2 +; CHECK-NEXT: vmovx.f16 s3, s3 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s3, s7 +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll index 7e2374f2885f5..f66eb8584a0bd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -190,15 +190,12 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_0213_0ext(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.s32 q2, q0, q3 -; CHECK-NEXT: vmullb.s32 q1, q4, q3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.s32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -213,15 +210,12 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.s32 q2, q3, q0 -; CHECK-NEXT: vmullb.s32 q1, q3, q4 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.s32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; 
CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -241,8 +235,8 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: umull lr, r12, r1, r0 ; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr @@ -252,10 +246,10 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-NEXT: mla r5, r3, r2, r5 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r4, s4 ; CHECK-NEXT: mla r3, r3, r0, r5 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r1 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: umull r5, lr, r4, r0 ; CHECK-NEXT: umull r3, r12, r1, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r5, r3 @@ -286,8 +280,8 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s6, s7 ; CHECK-NEXT: umull lr, r12, r0, r1 ; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr @@ -296,10 +290,10 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-NEXT: mla r1, r4, r1, r2 ; CHECK-NEXT: asrs r2, r3, #31 ; CHECK-NEXT: mla r2, r0, r2, r5 -; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: mla r2, r4, r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 -; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: umull r3, lr, r0, r5 ; CHECK-NEXT: umull r2, r12, r0, r1 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 @@ -474,15 +468,12 @@ entry: define arm_aapcs_vfpcc <4 x i64> @zext32_0213_0ext(<8 x 
i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.u32 q2, q0, q3 -; CHECK-NEXT: vmullb.u32 q1, q4, q3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.u32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -497,15 +488,12 @@ entry: define arm_aapcs_vfpcc <4 x i64> @zext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s16, s1 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vmullb.u32 q2, q3, q0 -; CHECK-NEXT: vmullb.u32 q1, q3, q4 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmullb.u32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -522,13 +510,13 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: umull r1, r12, r1, r0 ; CHECK-NEXT: umull r3, r2, r3, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: umull r1, r2, r1, r0 @@ -551,13 +539,13 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: 
vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: umull r1, r12, r0, r1 ; CHECK-NEXT: umull r3, r2, r0, r3 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r12 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: umull r1, r2, r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll index ebaeae88af718..eafbf41bc6241 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll @@ -72,16 +72,16 @@ entry: define <4 x i64> *@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vst2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: add.w r0, r1, #32 -; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmov.f32 s10, s6 ; CHECK-NEXT: vstrb.8 q0, [r1], #16 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: bx lr entry: @@ -144,11 +144,11 @@ entry: define <4 x double> *@vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) { ; CHECK-LABEL: vst2_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f64 d5, d0 ; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f64 d4, d2 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1], #32 ; CHECK-NEXT: mov r0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll index 67a606396127e..c749b36416f66 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -44,8 +44,8 @@ define void @vst2_v8i32(<8 x i32> *%src, <16 x i32> *%dst) { ; CHECK-LABEL: vst2_v8i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vst20.32 {q0, q1}, [r1] ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]! @@ -68,10 +68,10 @@ define void @vst2_v16i32(<16 x i32> *%src, <32 x i32> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #112] ; CHECK-NEXT: vldrw.u32 q3, [r0, #96] ; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] @@ -100,17 +100,17 @@ entry: define void @vst2_v4i32_align1(<4 x i32> *%src, <8 x i32> *%dst) { ; CHECK-LABEL: vst2_v4i32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s15, s1 +; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s15, s1 ; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: @@ -186,8 +186,8 @@ define void @vst2_v16i16(<16 x i16> *%src, <32 x i16> *%dst) { ; CHECK-LABEL: vst2_v16i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; 
CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vst20.16 {q0, q1}, [r1] ; CHECK-NEXT: vst21.16 {q0, q1}, [r1]! @@ -207,34 +207,31 @@ entry: define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) { ; CHECK-LABEL: vst2_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vins.f16 s1, s12 +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmov.f32 s0, s10 -; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vmovx.f16 s3, s11 +; CHECK-NEXT: vmovx.f16 s6, s7 ; CHECK-NEXT: vins.f16 s11, s7 -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmovx.f16 s14, s4 -; CHECK-NEXT: vins.f16 s3, s12 -; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vins.f16 s3, s6 +; CHECK-NEXT: vmovx.f16 s6, s8 ; CHECK-NEXT: vins.f16 s8, s4 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmovx.f16 s15, s9 +; CHECK-NEXT: vins.f16 s9, s5 ; CHECK-NEXT: vmovx.f16 s4, s5 -; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmov.f32 s0, s10 +; CHECK-NEXT: vins.f16 s15, s4 +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.f32 s13, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmovx.f16 s19, s9 -; CHECK-NEXT: vins.f16 s9, s5 -; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vins.f16 s19, s4 -; CHECK-NEXT: vstrb.8 q4, [r1] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -328,12 +325,12 @@ entry: define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) { ; CHECK-LABEL: vst2_v2i64: ; CHECK: @ %bb.0: 
@ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vmov.f32 s0, s6 ; CHECK-NEXT: vstrb.8 q2, [r1], #16 ; CHECK-NEXT: vmov.f32 s1, s7 @@ -354,25 +351,25 @@ define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov.f64 d10, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s12, s2 ; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s20, s6 ; CHECK-NEXT: vmov.f32 s21, s7 ; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s14, s18 ; CHECK-NEXT: vstrb.8 q1, [r1], #48 -; CHECK-NEXT: vmov.f32 s23, s11 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s22, s10 ; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f32 s23, s11 ; CHECK-NEXT: vstrw.32 q5, [r1, #-32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -429,8 +426,8 @@ define void @vst2_v8f32(<8 x float> *%src, <16 x float> *%dst) { ; CHECK-LABEL: vst2_v8f32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vst20.32 
{q0, q1}, [r1] ; CHECK-NEXT: vst21.32 {q0, q1}, [r1]! @@ -453,10 +450,10 @@ define void @vst2_v16f32(<16 x float> *%src, <32 x float> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #112] ; CHECK-NEXT: vldrw.u32 q3, [r0, #96] ; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] @@ -485,17 +482,17 @@ entry: define void @vst2_v4f32_align1(<4 x float> *%src, <8 x float> *%dst) { ; CHECK-LABEL: vst2_v4f32_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s0 ; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s15, s1 +; CHECK-NEXT: vmov.f32 s12, s4 ; CHECK-NEXT: vstrb.8 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s15, s1 ; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: @@ -543,19 +540,19 @@ define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) { ; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 ; CHECK-NEXT: vins.f16 s1, s5 -; CHECK-NEXT: vins.f16 s10, s4 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.f32 s5, s8 -; 
CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vstrh.16 q1, [r1] +; CHECK-NEXT: vmovx.f16 s6, s5 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmov.f32 s9, s2 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s4 +; CHECK-NEXT: vstrh.16 q2, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -589,8 +586,8 @@ define void @vst2_v16f16(<16 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst2_v16f16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst20.16 {q2, q3}, [r1] ; CHECK-NEXT: vst21.16 {q2, q3}, [r1]! @@ -610,32 +607,32 @@ entry: define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vst2_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmovx.f16 s1, s6 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vins.f16 s1, s12 -; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s1, s0 ; CHECK-NEXT: vmovx.f16 s3, s7 -; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vins.f16 s3, s0 +; CHECK-NEXT: vmovx.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s0, s8 ; CHECK-NEXT: vins.f16 s7, s11 -; CHECK-NEXT: vins.f16 s3, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s14, s8 ; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vmovx.f16 s14, s5 +; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmovx.f16 s8, s5 ; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vmovx.f16 s8, s9 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vins.f16 s8, s0 ; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vins.f16 s14, s8 -; 
CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s9, s12 -; CHECK-NEXT: vmov.f32 s10, s5 ; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vmov.f32 s13, s10 ; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vstrb.8 q2, [r1] +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s15, s8 +; CHECK-NEXT: vstrb.8 q3, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -652,8 +649,8 @@ entry: define void @vst2_v2f64(<2 x double> *%src, <4 x double> *%dst) { ; CHECK-LABEL: vst2_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f64 d4, d3 ; CHECK-NEXT: vmov.f64 d5, d1 ; CHECK-NEXT: vmov.f64 d3, d0 @@ -675,17 +672,17 @@ define void @vst2_v4f64(<4 x double> *%src, <8 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vmov.f64 d8, d4 ; CHECK-NEXT: vmov.f64 d9, d0 ; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vmov.f64 d5, d2 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f64 d4, d6 ; CHECK-NEXT: vmov.f64 d2, d7 ; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vstrw.32 q1, [r1, #48] diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index 1e46dd1b256f5..7d4763fdeb03a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -12,16 +12,15 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) { ; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov.32 
q0[0], r4 ; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q0[0], r4 ; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: pop {r4, pc} @@ -44,22 +43,22 @@ define void @vst3_v4i32(<4 x i32> *%src, <12 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s19, s13 ; CHECK-NEXT: vmov.f32 s9, s1 ; CHECK-NEXT: vmov.f32 s18, s0 ; CHECK-NEXT: vmov.f32 s0, s2 -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vmov.f32 s11, s6 -; CHECK-NEXT: vmov.f32 s1, s15 ; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmov.f32 s16, s12 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s2, s7 ; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -84,41 +83,41 @@ define void @vst3_v8i32(<8 x i32> *%src, <24 x i32> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vldrw.u32 q7, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, 
#48] ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s28 -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.f32 s29, s12 -; CHECK-NEXT: vmov.f32 s9, s27 -; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s20, s28 +; CHECK-NEXT: vmov.f32 s9, s19 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmov.f32 s23, s29 +; CHECK-NEXT: vstrw.32 q2, [r1, #80] +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s21, s24 +; CHECK-NEXT: vmov.f32 s29, s12 +; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f32 s30, s0 ; CHECK-NEXT: vmov.f32 s0, s13 ; CHECK-NEXT: vstrw.32 q7, [r1, #48] ; CHECK-NEXT: vmov.f32 s3, s14 -; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #64] -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vstrw.32 q2, [r1, #80] -; CHECK-NEXT: vmov.f32 s12, s25 ; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s12, s25 ; CHECK-NEXT: vmov.f32 s15, s26 -; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s6, s27 +; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f32 s6, s27 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -142,108 +141,106 @@ define void @vst3_v16i32(<16 x i32> *%src, <48 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, 
d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: .pad #144 +; CHECK-NEXT: sub sp, #144 ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] +; CHECK-NEXT: vldrw.u32 q3, [r0, #160] ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #128] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vldrw.u32 q3, [r0, #160] -; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] -; CHECK-NEXT: vmov.f32 s17, s9 ; CHECK-NEXT: vstrw.32 q3, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s19, s2 -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vldrw.u32 q5, [r0, #144] -; CHECK-NEXT: vldrw.u32 q1, [r0, #176] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vstrw.32 q4, [r1, #16] -; CHECK-NEXT: vmov.f64 d8, d5 +; CHECK-NEXT: vmov.f32 s16, s1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] +; CHECK-NEXT: vmov.f32 s19, s2 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r0, #144] +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s17, s27 ; CHECK-NEXT: vmov.f32 s19, s11 -; CHECK-NEXT: vmov.f32 s18, s3 ; CHECK-NEXT: vstrw.32 
q4, [r1, #32] -; CHECK-NEXT: vmov.f64 d8, d3 -; CHECK-NEXT: vmov.f32 s17, s31 +; CHECK-NEXT: vmov.f32 s16, s6 ; CHECK-NEXT: vmov.f32 s19, s7 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d8, d12 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s18, s31 +; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s18, s8 -; CHECK-NEXT: vmov q2, q7 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f64 d4, d14 ; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s4, s13 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f64 d14, d4 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s7, s14 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.f32 s16, s24 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d0, d14 +; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vmov.f32 s7, s30 ; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s31, s1 +; CHECK-NEXT: vmov.f64 d0, d10 ; CHECK-NEXT: vmov.f32 s16, s5 -; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vmov.f32 s19, s6 -; 
CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s20 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q2, [r1, #128] -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s29, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s24, s2 +; CHECK-NEXT: vmov.f32 s30, s4 +; CHECK-NEXT: vmov.f32 s27, s3 ; CHECK-NEXT: vstrw.32 q7, [r1, #96] -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f64 d10, d0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vstrw.32 q4, [r1, #112] ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s15, s6 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: 
vmov.f32 s13, s11 +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #64] +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s23 +; CHECK-NEXT: vstrw.32 q3, [r1, #128] +; CHECK-NEXT: vmov.f32 s26, s11 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vmov.f32 s23, s10 +; CHECK-NEXT: vstrw.32 q5, [r1, #64] +; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -303,23 +300,23 @@ define void @vst3_v4i16(<4 x i16> *%src, <12 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vldrh.u32 q1, [r0] ; CHECK-NEXT: vldrh.u32 q0, [r0, #8] -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vldrh.u32 q2, [r0, #16] ; CHECK-NEXT: vmov r0, r5, d2 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov lr, r4, d1 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: vmov.32 q1[2], r4 ; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vstrh.32 q1, [r1, #16] ; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov r12, s6 ; CHECK-NEXT: vmov.16 q0[3], r5 -; CHECK-NEXT: vstrh.32 q3, [r1, #16] ; CHECK-NEXT: vmov.16 q0[4], r3 ; CHECK-NEXT: vmov.16 q0[5], r4 ; CHECK-NEXT: vmov.16 q0[6], r12 @@ -343,64 +340,52 @@ entry: define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) { ; CHECK-LABEL: vst3_v8i16: ; CHECK: @ %bb.0: @ %entry -; 
CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.f32 s0, s8 ; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmovx.f16 s20, s8 ; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vmov.f32 s12, s9 ; CHECK-NEXT: vins.f16 s12, s5 ; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov.f32 s3, s12 ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s17, s12 -; CHECK-NEXT: vmov.f32 s18, s12 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s14 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vmovx.f16 s8, s8 ; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vins.f16 s16, s20 -; CHECK-NEXT: vmovx.f16 s20, s15 +; CHECK-NEXT: vmov.f32 s1, s12 ; CHECK-NEXT: vins.f16 s17, s7 -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vins.f16 s1, s8 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vins.f16 s16, s8 ; CHECK-NEXT: vmovx.f16 s19, s7 -; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vins.f16 s19, s20 -; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmovx.f16 s8, s15 ; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vins.f16 s21, s24 -; CHECK-NEXT: vmovx.f16 s24, s22 -; CHECK-NEXT: vins.f16 s18, s24 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s22, s18 -; CHECK-NEXT: vmov.f32 s17, s21 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vins.f16 s12, s20 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vins.f16 s14, s20 +; CHECK-NEXT: vins.f16 s19, s8 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: 
vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s17, s8 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vins.f16 s18, s8 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vins.f16 s8, s12 +; CHECK-NEXT: vmovx.f16 s12, s10 +; CHECK-NEXT: vins.f16 s14, s12 +; CHECK-NEXT: vrev32.16 q1, q1 +; CHECK-NEXT: vmovx.f16 s12, s13 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s5, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s15, s14 -; CHECK-NEXT: vmov.f32 s14, s10 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vins.f16 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s6 -; CHECK-NEXT: vins.f16 s14, s8 -; CHECK-NEXT: vmov.f32 s6, s14 -; CHECK-NEXT: vmov.f32 s13, s5 -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -421,135 +406,112 @@ define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vins.f16 s9, s23 -; CHECK-NEXT: vmov.u16 r2, q6[1] -; CHECK-NEXT: vmovx.f16 s11, s23 -; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s11, s0 -; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vmovx.f16 s4, s9 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s1, s11 -; 
CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vmov.f32 s12, s4 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov.f32 s15, s7 +; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s28, s4 -; CHECK-NEXT: vins.f16 s8, s24 -; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: vins.f16 s11, s25 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.f32 s20, s24 +; CHECK-NEXT: vins.f16 s20, s4 +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: vmovx.f16 s4, s24 ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vmov.f32 s9, s4 -; CHECK-NEXT: vmov.u16 r0, q5[1] +; CHECK-NEXT: vmov.f32 s14, s18 +; CHECK-NEXT: vins.f16 s23, s5 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s5, s24 -; CHECK-NEXT: vmov.f32 s6, s24 -; CHECK-NEXT: vins.f16 s5, s28 -; CHECK-NEXT: vmovx.f16 s28, s6 -; CHECK-NEXT: vins.f16 s10, s28 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: 
vmov.f32 s6, s10 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f64 d14, d2 -; CHECK-NEXT: vins.f16 s28, s20 -; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s0, s21 +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s28, s0 +; CHECK-NEXT: vins.f16 s5, s4 +; CHECK-NEXT: vmovx.f16 s4, s24 +; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vins.f16 s28, s12 +; CHECK-NEXT: vins.f16 s22, s4 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.f32 s31, s0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s29, s4 -; CHECK-NEXT: vmovx.f16 s4, s4 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vmovx.f16 s4, s26 -; CHECK-NEXT: vmov.f32 s2, s30 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vmov.f32 s12, s13 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vins.f16 s0, s4 -; CHECK-NEXT: vmov q1, q4 -; CHECK-NEXT: vins.f16 s1, s7 +; CHECK-NEXT: vins.f16 s4, s13 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vmov.f32 s31, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s29, s8 +; CHECK-NEXT: vins.f16 s29, s0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s30, s0 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vins.f16 s5, s7 +; CHECK-NEXT: vmovx.f16 s7, s7 +; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: vmovx.f16 s6, s1 
+; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s27 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.f32 s12, s25 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vmovx.f16 s6, s2 +; CHECK-NEXT: vins.f16 s10, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmovx.f16 s3, s7 -; CHECK-NEXT: vmovx.f16 s4, s27 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmov.f32 s5, s23 -; CHECK-NEXT: vmov.f32 s2, s27 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vins.f16 s9, s6 +; CHECK-NEXT: vmovx.f16 s6, s10 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vmov.f32 s10, s18 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov.f32 s24, s25 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vins.f16 s12, s20 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vins.f16 s14, s20 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s15, s14 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vstr s16, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s1, s5 -; CHECK-NEXT: vrev32.16 q5, q4 -; CHECK-NEXT: vldr s16, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: vins.f16 s21, s16 -; CHECK-NEXT: vmovx.f16 s16, s22 -; CHECK-NEXT: vins.f16 s14, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmovx.f16 s4, s17 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vins.f16 s24, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vins.f16 s26, s4 -; CHECK-NEXT: 
vmov.f32 s13, s21 -; CHECK-NEXT: vmov.f32 s27, s26 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vmov.f32 s15, s26 +; CHECK-NEXT: vmovx.f16 s8, s25 +; CHECK-NEXT: vrev32.16 q6, q4 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s25, s8 +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q4, q4 -; CHECK-NEXT: vins.f16 s17, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vins.f16 s26, s4 -; CHECK-NEXT: vmov.f32 s14, s22 -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vstrw.32 q3, [r1, #64] -; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vins.f16 s10, s8 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vmov.f32 s13, s25 ; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vmov.f32 s26, s18 -; CHECK-NEXT: vstrw.32 q6, [r1, #16] -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -664,29 +626,26 @@ entry: define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK-LABEL: vst3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vldrb.u16 q2, [r0, #8] -; CHECK-NEXT: vmovx.f16 s12, s6 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vldrb.u16 q1, [r0, #16] +; CHECK-NEXT: vldrb.u16 q3, [r0] ; CHECK-NEXT: vins.f16 s1, s11 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s0, s10 ; CHECK-NEXT: vmovx.f16 s3, s11 -; CHECK-NEXT: vins.f16 s3, s12 -; 
CHECK-NEXT: vldrb.u16 q3, [r0] -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmovx.f16 s20, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 +; CHECK-NEXT: vmovx.f16 s1, s1 ; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vins.f16 s3, s2 +; CHECK-NEXT: vins.f16 s17, s1 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmovx.f16 s1, s15 ; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vins.f16 s2, s1 ; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s2, s18 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q2[0] ; CHECK-NEXT: vstrb.16 q0, [r1, #16] @@ -720,7 +679,7 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK-NEXT: vmov.u16 r0, q3[5] ; CHECK-NEXT: vmov.8 q4[15], r0 ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0 @@ -957,11 +916,9 @@ entry: define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) { ; CHECK-LABEL: vst3_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vmov.f32 s14, s2 ; CHECK-NEXT: vmov.f32 s15, s3 ; CHECK-NEXT: vmov.f32 s2, s6 @@ -969,8 +926,10 @@ define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) { ; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vmov.f32 s7, s9 ; CHECK-NEXT: vstrb.8 q1, [r1], #32 -; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s13, s11 ; CHECK-NEXT: vstrw.32 q0, [r1, #-16] +; CHECK-NEXT: vstrw.32 q3, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0 @@ -991,41 +950,37 @@ define void @vst3_v4i64(<4 x i64> *%src, <12 
x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f64 d10, d2 -; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vmov.f64 d7, d15 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s19, s3 +; CHECK-NEXT: vmov.f32 s20, s4 +; CHECK-NEXT: vstrw.32 q4, [r1, #80] ; CHECK-NEXT: vmov.f32 s21, s5 ; CHECK-NEXT: vmov.f32 s22, s28 ; CHECK-NEXT: vmov.f32 s23, s29 -; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vmov.f32 s4, s8 ; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s5, s9 +; CHECK-NEXT: vmov.f32 s28, s24 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vmov.f32 s29, s25 -; CHECK-NEXT: vmov.f64 d8, d7 ; CHECK-NEXT: vmov.f32 s30, s12 -; CHECK-NEXT: vmov.f32 s17, s15 ; CHECK-NEXT: vmov.f32 s31, s13 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] -; CHECK-NEXT: vmov.f32 s4, s8 -; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmov.f32 s2, s26 -; CHECK-NEXT: vstrw.32 q4, [r1, #80] -; CHECK-NEXT: vmov.f32 s5, s9 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vstrw.32 q7, [r1, #48] ; CHECK-NEXT: vmov.f32 s3, s27 -; CHECK-NEXT: vmov.f32 s9, s15 +; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s9, s15 ; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; 
CHECK-NEXT: bx lr entry: @@ -1047,10 +1002,10 @@ entry: define void @vst3_v2f32(<2 x float> *%src, <6 x float> *%dst) { ; CHECK-LABEL: vst3_v2f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr r2, [r0, #20] ; CHECK-NEXT: vldr s0, [r0] ; CHECK-NEXT: vldr s3, [r0, #4] ; CHECK-NEXT: vldr s1, [r0, #8] -; CHECK-NEXT: ldr r2, [r0, #20] ; CHECK-NEXT: vldr s2, [r0, #16] ; CHECK-NEXT: ldr r0, [r0, #12] ; CHECK-NEXT: strd r0, r2, [r1, #16] @@ -1075,22 +1030,22 @@ define void @vst3_v4f32(<4 x float> *%src, <12 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vmov.f32 s17, s0 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vmov.f32 s19, s13 ; CHECK-NEXT: vmov.f32 s9, s5 ; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vstrw.32 q4, [r1] ; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s5, s15 ; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s16, s12 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s17, s0 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vmov.f32 s5, s15 +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr @@ -1115,41 +1070,41 @@ define void @vst3_v8f32(<8 x float> *%src, <24 x float> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vstrw.32 
q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vmov.f32 s21, s24 -; CHECK-NEXT: vmov.f64 d12, d4 -; CHECK-NEXT: vmov.f64 d6, d1 -; CHECK-NEXT: vmov.f32 s25, s28 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov.f32 s27, s9 +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s20, s24 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: vmov.f32 s24, s16 +; CHECK-NEXT: vmov.f32 s27, s17 +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: vstrw.32 q3, [r1, #80] +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f32 s21, s28 +; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vmov.f32 s0, s29 +; CHECK-NEXT: vmov.f32 s0, s9 ; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vmov.f32 s3, s30 -; CHECK-NEXT: vmov.f32 s14, s31 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s17 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.f32 s8, s29 -; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f32 s3, s10 ; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #64] ; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s8, s29 ; CHECK-NEXT: vmov.f32 s11, s30 -; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vmov.f32 s6, s31 +; CHECK-NEXT: vmov.f32 s5, s19 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s6, s31 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -1173,107 +1128,106 @@ define void @vst3_v16f32(<16 x float> *%src, <48 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: 
vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: .pad #144 +; CHECK-NEXT: sub sp, #144 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] +; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] ; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vstrw.32 q5, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] ; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0, #160] +; CHECK-NEXT: vstrw.32 q5, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vstrw.32 q7, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s13, s9 ; CHECK-NEXT: vmov.f32 s15, s2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #160] -; CHECK-NEXT: vstrw.32 q5, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q7, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #176] ; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q4, [r0, #144] -; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #176] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: vmov.f64 d6, d5 -; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s13, s27 ; CHECK-NEXT: vmov.f32 s15, 
s11 -; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov.f64 d6, d3 ; CHECK-NEXT: vmov.f32 s13, s23 +; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s15, s7 ; CHECK-NEXT: vmov.f32 s14, s31 -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d6, d12 +; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s15, s25 ; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vmov q2, q7 -; CHECK-NEXT: vmov.f64 d0, d10 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f64 d4, d14 +; CHECK-NEXT: vmov.f32 s0, s20 ; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vmov.f64 d10, d2 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f64 d14, d2 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s7, s10 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s23, s30 +; CHECK-NEXT: vmov.f32 s12, s24 +; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s15, s25 +; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d12, d9 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vmov.f32 
s25, s7 -; CHECK-NEXT: vstrw.32 q5, [r1, #112] -; CHECK-NEXT: vmov.f32 s27, s19 -; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d0, d14 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q2, [r1, #128] -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vmov.f64 d0, d8 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vmov.f32 s23, s10 +; CHECK-NEXT: vmov.f32 s14, s11 +; CHECK-NEXT: vmov.f32 s29, s8 +; CHECK-NEXT: vldrw.u32 q2, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s24, s2 +; CHECK-NEXT: vmov.f32 s30, s8 +; CHECK-NEXT: vmov.f32 s27, s3 ; CHECK-NEXT: vstrw.32 q7, [r1, #96] -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f64 d8, d0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vstrw.32 q5, [r1, #112] ; CHECK-NEXT: vstrw.32 q0, [r1, #144] ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s19, s14 ; CHECK-NEXT: vstrw.32 q0, [r1, #160] -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s13, s7 +; 
CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r1, #176] -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q4, [r1, #64] +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vstrw.32 q3, [r1, #128] +; CHECK-NEXT: vmov.f32 s26, s7 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vstrw.32 q4, [r1, #64] +; CHECK-NEXT: add sp, #144 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1297,14 +1251,14 @@ define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldmia r0, {s0, s1} ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vmovx.f16 s10, s4 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vins.f16 s2, s10 ; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vins.f16 s2, s6 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: stm r1!, {r0, r2, r3} @@ -1328,8 +1282,6 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: ldrd r2, r12, [r0] ; CHECK-NEXT: ldrd r3, lr, [r0, #8] ; CHECK-NEXT: vmov.32 q0[0], r2 @@ -1337,30 +1289,29 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 ; CHECK-NEXT: vmov.32 q1[1], lr -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: 
vmovx.f16 s10, s0 ; CHECK-NEXT: vmov.f32 s8, s1 -; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vins.f16 s8, s5 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmovx.f16 s13, s3 +; CHECK-NEXT: vmovx.f16 s6, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s10, s4 ; CHECK-NEXT: vmovx.f16 s2, s2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 s4, s10 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s5, s10 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vins.f16 s13, s10 +; CHECK-NEXT: vmov.f32 s12, s5 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vmov.f32 s3, s8 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov r0, r2, d8 +; CHECK-NEXT: vmov r0, r2, d6 ; CHECK-NEXT: strd r0, r2, [r1, #16] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -1379,65 +1330,53 @@ entry: define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) { ; CHECK-LABEL: vst3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vins.f16 s0, s20 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vins.f16 s4, s21 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: 
vmovx.f16 s2, s12 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.f32 s3, s4 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s15, s23 -; CHECK-NEXT: vins.f16 s15, s24 -; CHECK-NEXT: vmovx.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s12, s22 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vins.f16 s12, s24 -; CHECK-NEXT: vmov.f32 s25, s11 -; CHECK-NEXT: vins.f16 s13, s23 -; CHECK-NEXT: vmov.f32 s26, s11 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmovx.f16 s28, s13 -; CHECK-NEXT: vins.f16 s25, s28 -; CHECK-NEXT: vmovx.f16 s28, s26 -; CHECK-NEXT: vins.f16 s14, s28 -; CHECK-NEXT: vmovx.f16 s28, s9 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vrev32.16 q5, q5 -; CHECK-NEXT: vins.f16 s4, s28 -; CHECK-NEXT: vmovx.f16 s28, s10 -; CHECK-NEXT: vins.f16 s6, s28 -; CHECK-NEXT: vmov.f32 s26, s14 -; CHECK-NEXT: vmov.f32 s7, s6 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s21, s8 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.f32 s5, s21 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vmovx.f16 s11, s15 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vins.f16 s1, s4 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vins.f16 s8, s13 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vins.f16 s11, s4 +; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vmovx.f16 s8, s14 +; CHECK-NEXT: vins.f16 s9, s15 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s9 +; 
CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vrev32.16 q3, q3 +; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vmovx.f16 s4, s14 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s14, s6 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -1458,150 +1397,121 @@ define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #128 -; CHECK-NEXT: sub sp, #128 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmovx.f16 s7, s15 +; CHECK-NEXT: .pad #96 +; CHECK-NEXT: sub sp, #96 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s7, s11 ; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vstrw.32 q5, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s1, s11 +; CHECK-NEXT: 
vldrw.u32 q2, [r0] ; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmov.f64 d14, d12 -; CHECK-NEXT: vins.f16 s5, s15 -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s5, s27 -; CHECK-NEXT: vmov.f32 s6, s27 -; CHECK-NEXT: vins.f16 s28, s12 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s5, s11 ; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d2, d10 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s0, s21 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vins.f16 s0, s9 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vmov.f32 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.f32 s5, s20 -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.f32 s9, s20 -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.f32 s10, s20 -; CHECK-NEXT: vins.f16 s9, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov q7, q4 ; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmov.f32 s0, s25 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vins.f16 s0, s13 -; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmovx.f16 s2, s20 +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s4, s16 +; CHECK-NEXT: vins.f16 s4, s20 +; CHECK-NEXT: vmov.f32 s0, s17 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vstrw.32 q7, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s16, s4 +; 
CHECK-NEXT: vmovx.f16 s4, s28 +; CHECK-NEXT: vldrw.u32 q7, [r0, #80] +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vins.f16 s0, s21 +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s0 +; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vins.f16 s18, s0 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmovx.f16 s8, s24 +; CHECK-NEXT: vmov.f32 s22, s28 +; CHECK-NEXT: vins.f16 s20, s24 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s17, s28 +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vins.f16 s17, s4 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s21, s12 +; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vins.f16 s21, s0 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vins.f16 s22, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vmovx.f16 s27, s11 +; CHECK-NEXT: vins.f16 s4, s25 +; CHECK-NEXT: vins.f16 s27, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s25, s11 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vmov.f32 s25, s3 +; CHECK-NEXT: vmov.f32 s26, s31 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vins.f16 s25, s4 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov.f32 s0, s29 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vins.f16 s30, s4 +; CHECK-NEXT: vmov.f32 s6, s18 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmov.f32 s3, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s4, s29 +; CHECK-NEXT: vmov.f32 s8, s13 
+; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s30 +; CHECK-NEXT: vins.f16 s14, s4 +; CHECK-NEXT: vmov.f32 s10, s30 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vmovx.f16 s4, s13 ; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s31, s0 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmov.f32 s29, s24 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vins.f16 s30, s0 -; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vrev32.16 q3, q3 +; CHECK-NEXT: vmov.f32 s6, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s13, s4 ; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vmovx.f16 s7, s15 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vins.f16 s5, s15 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmov.f32 s1, s15 -; CHECK-NEXT: vmov.f32 s2, s15 -; CHECK-NEXT: vins.f16 s1, s16 -; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vins.f16 s6, s16 -; CHECK-NEXT: vmovx.f16 s16, s13 -; CHECK-NEXT: vmov.f32 s20, s21 -; CHECK-NEXT: vins.f16 s20, s16 -; CHECK-NEXT: vmovx.f16 s16, s14 -; CHECK-NEXT: vins.f16 s22, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s23, s22 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vstrw.32 q3, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s30 -; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 
s2, s6 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmovx.f16 s12, s21 -; CHECK-NEXT: vstr s12, [sp, #64] @ 4-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s5, s1 -; CHECK-NEXT: vrev32.16 q4, q3 -; CHECK-NEXT: vldr s12, [sp, #64] @ 4-byte Reload -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s22, s12 -; CHECK-NEXT: vmovx.f16 s12, s25 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vins.f16 s8, s12 -; CHECK-NEXT: vmovx.f16 s0, s26 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s11, s10 -; CHECK-NEXT: vstrw.32 q1, [r1, #80] -; CHECK-NEXT: vmov.f32 s10, s26 -; CHECK-NEXT: vrev32.16 q6, q0 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s25, s12 -; CHECK-NEXT: vmovx.f16 s12, s26 -; CHECK-NEXT: vins.f16 s10, s12 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s26, s10 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vstrw.32 q7, [r1] -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s21, s17 -; CHECK-NEXT: vmov.f32 s9, s25 -; CHECK-NEXT: vmov.f32 s22, s18 -; CHECK-NEXT: vmov.f32 s10, s26 -; CHECK-NEXT: vstrw.32 q5, [r1, #64] +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vins.f16 s10, s4 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vmov.f32 s4, s28 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, 
#48] -; CHECK-NEXT: add sp, #128 +; CHECK-NEXT: vmov.f32 s7, s31 +; CHECK-NEXT: vstrw.32 q4, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] +; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: add sp, #96 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1623,8 +1533,8 @@ entry: define void @vst3_v2f64(<2 x double> *%src, <6 x double> *%dst) { ; CHECK-LABEL: vst3_v2f64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vmov.f64 d6, d2 ; CHECK-NEXT: vmov.f64 d7, d1 @@ -1653,32 +1563,28 @@ define void @vst3_v4f64(<4 x double> *%src, <12 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrw.u32 q7, [r0, #48] ; CHECK-NEXT: vldrw.u32 q6, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f64 d10, d2 +; CHECK-NEXT: vmov.f64 d15, d13 ; CHECK-NEXT: vmov.f64 d7, d1 -; CHECK-NEXT: vmov.f64 d11, d12 +; CHECK-NEXT: vmov.f64 d10, d2 ; CHECK-NEXT: vstrw.32 q3, [r1, #80] -; CHECK-NEXT: vmov.f64 d12, d4 +; CHECK-NEXT: vmov.f64 d11, d12 +; CHECK-NEXT: vmov.f64 d2, d8 ; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f64 d1, d5 -; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d2, d8 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmov.f64 d8, d15 ; CHECK-NEXT: vstrw.32 q0, [r1, #64] +; CHECK-NEXT: vmov.f64 d12, d4 +; CHECK-NEXT: vstrw.32 q4, [r1, #32] ; CHECK-NEXT: vmov.f64 d13, d14 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] 
-; CHECK-NEXT: vmov.f64 d8, d5 ; CHECK-NEXT: vstrw.32 q6, [r1, #48] -; CHECK-NEXT: vstrw.32 q4, [r1, #32] -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll index 4c939fc09e59b..ee1fe9e69c255 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll @@ -104,21 +104,21 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d8 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vmov.f32 s8, s16 ; CHECK-NEXT: vmov.f32 s9, s17 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s1 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f64 d8, d6 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: vmov.f32 s17, s13 ; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] ; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s13 ; CHECK-NEXT: vmov.f32 s4, s14 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] ; CHECK-NEXT: vmov.f32 s5, s15 @@ -215,16 +215,16 @@ define <8 x double> *@vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmov.f64 d2, d6 ; CHECK-NEXT: vmov.f64 d3, d0 ; CHECK-NEXT: vmov.f64 d0, d7 -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vstrw.32 q0, [r1, #32] 
; CHECK-NEXT: vmov.f64 d7, d4 +; CHECK-NEXT: vstrw.32 q0, [r1, #32] +; CHECK-NEXT: vmov.f64 d6, d8 ; CHECK-NEXT: vmov.f64 d4, d9 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #48] diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index f275049eddfc6..db4a438ae076a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -13,20 +13,20 @@ define void @vst4_v2i32(<2 x i32> *%src, <8 x i32> *%dst) { ; CHECK-NEXT: ldrd r3, r2, [r0, #8] ; CHECK-NEXT: ldm r6, {r4, r5, r6} ; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 ; CHECK-NEXT: ldr r0, [r0, #28] +; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vmov.f32 s8, s4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 ; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov.f32 s6, s1 ; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 @@ -78,12 +78,12 @@ define void @vst4_v8i32(<8 x i32> *%src, <32 x i32> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; 
CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] @@ -120,55 +120,50 @@ define void @vst4_v16i32(<16 x i32> *%src, <64 x i32> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #192 ; CHECK-NEXT: sub sp, #192 -; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q7, [r0, #240] -; CHECK-NEXT: vldrw.u32 q3, [r0, #208] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #208] ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #192] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q4, [r0, #240] +; CHECK-NEXT: vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q6, [r0, #160] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] ; CHECK-NEXT: 
vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #192] -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vldrw.u32 q3, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! 
-; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] @@ -210,28 +205,28 @@ define void @vst4_v4i32_align1(<4 x i32> *%src, <16 x i32> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vmov.f32 s24, s19 -; CHECK-NEXT: vmov.f32 s13, s9 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vmov.f32 s14, s1 ; CHECK-NEXT: vmov.f32 s22, s0 ; CHECK-NEXT: vmov.f32 s26, s3 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: vmov.f32 s13, s9 ; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s20, s16 ; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s24, s19 ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr @@ -385,12 +380,12 @@ define void @vst4_v16i16(<16 x i16> *%src, <64 x i16> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, 
d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] @@ -421,61 +416,61 @@ entry: define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) { ; CHECK-LABEL: vst4_v8i16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vins.f16 s5, s9 ; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: vins.f16 s5, s9 ; CHECK-NEXT: vins.f16 s12, s0 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov.f32 s3, s12 -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmovx.f16 s27, s4 ; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmov.f32 s3, s12 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f32 s5, s4 +; CHECK-NEXT: vmovx.f16 s8, s8 +; CHECK-NEXT: vmovx.f16 s0, s17 ; CHECK-NEXT: vmovx.f16 s2, s13 +; CHECK-NEXT: vins.f16 s27, s8 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s8, s16 ; CHECK-NEXT: vins.f16 s13, s17 -; CHECK-NEXT: vins.f16 s2, s20 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vins.f16 s27, s20 -; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vmovx.f16 s28, s12 -; CHECK-NEXT: vmovx.f16 s20, s16 ; CHECK-NEXT: vins.f16 s12, s16 -; CHECK-NEXT: vins.f16 s28, s20 ; 
CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov.f32 s25, s4 -; CHECK-NEXT: vmov.f32 s22, s28 -; CHECK-NEXT: vmovx.f16 s28, s11 -; CHECK-NEXT: vmov.f32 s21, s4 -; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vmov.f32 s23, s27 ; CHECK-NEXT: vmovx.f16 s27, s7 -; CHECK-NEXT: vins.f16 s27, s28 ; CHECK-NEXT: vins.f16 s7, s11 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vmovx.f16 s28, s19 +; CHECK-NEXT: vins.f16 s27, s4 ; CHECK-NEXT: vmovx.f16 s26, s15 -; CHECK-NEXT: vins.f16 s15, s19 -; CHECK-NEXT: vins.f16 s26, s28 -; CHECK-NEXT: vmovx.f16 s31, s6 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmov.f32 s25, s7 +; CHECK-NEXT: vins.f16 s26, s4 +; CHECK-NEXT: vmovx.f16 s7, s6 +; CHECK-NEXT: vmovx.f16 s4, s10 ; CHECK-NEXT: vins.f16 s6, s10 -; CHECK-NEXT: vins.f16 s31, s8 -; CHECK-NEXT: vmov.f32 s29, s6 +; CHECK-NEXT: vmov.f32 s21, s5 +; CHECK-NEXT: vins.f16 s15, s19 +; CHECK-NEXT: vins.f16 s7, s4 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s14 ; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vmovx.f16 s30, s14 -; CHECK-NEXT: vmov.f32 s24, s15 ; CHECK-NEXT: vins.f16 s14, s18 -; CHECK-NEXT: vins.f16 s30, s4 -; CHECK-NEXT: vmov.f32 s28, s14 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vmov.f32 s24, s15 +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmov.f32 s4, s14 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] -; CHECK-NEXT: vstrb.8 q7, [r1, #32] +; CHECK-NEXT: vstrb.8 q1, [r1, #32] ; CHECK-NEXT: vstrb.8 q0, [r1, #16] ; CHECK-NEXT: vstrb.8 q5, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0 @@ -723,25 +718,25 @@ define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q4, 
[r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d6, d8 -; CHECK-NEXT: vmov.f64 d10, d4 -; CHECK-NEXT: vmov.f32 s13, s17 -; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vmov.f32 s14, s0 -; CHECK-NEXT: vmov.f32 s22, s4 ; CHECK-NEXT: vmov.f32 s15, s1 +; CHECK-NEXT: vmov.f32 s22, s4 ; CHECK-NEXT: vmov.f32 s23, s5 +; CHECK-NEXT: vmov.f32 s12, s16 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.f32 s20, s8 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: vmov.f32 s21, s9 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrw.32 q5, [r1] -; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s1, s19 -; CHECK-NEXT: vmov.f32 s5, s11 +; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vstrw.32 q0, [r1, #48] +; CHECK-NEXT: vmov.f32 s5, s11 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -766,57 +761,56 @@ define void @vst4_v4i64(<4 x i64> *%src, <16 x i64> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 -; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmov.f64 d12, d14 +; CHECK-NEXT: vldrw.u32 q7, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #96] -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vmov.f32 s6, s0 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s25, s29 -; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vmov.f32 s27, s1 -; CHECK-NEXT: 
vmov.f32 s0, s30 -; CHECK-NEXT: vstrw.32 q6, [r1] -; CHECK-NEXT: vmov.f32 s1, s31 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f64 d2, d6 -; CHECK-NEXT: vmov.f32 s5, s13 -; CHECK-NEXT: vmov.f64 d14, d0 -; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov.f64 d13, d1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vmov.f32 s4, s28 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s5, s29 +; CHECK-NEXT: vmov.f32 s24, s30 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s25, s31 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov.f32 s4, s12 +; CHECK-NEXT: vmov.f32 s5, s13 ; CHECK-NEXT: vmov.f32 s8, s14 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vmov.f32 s9, s15 -; CHECK-NEXT: vmov.f64 d6, d0 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d1, d15 ; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f64 d13, d7 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s21 ; CHECK-NEXT: vmov.f32 s30, s16 +; CHECK-NEXT: vstrw.32 q3, [r1, #80] ; CHECK-NEXT: vmov.f32 s31, s17 +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s16, s2 ; CHECK-NEXT: vstrw.32 q7, [r1, #64] ; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vstrw.32 q4, [r1, #96] -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov.f32 s20, s2 -; CHECK-NEXT: vstrw.32 q3, [r1, #80] 
-; CHECK-NEXT: vmov.f32 s21, s3 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s20, s26 +; CHECK-NEXT: vstrw.32 q4, [r1, #96] +; CHECK-NEXT: vmov.f32 s21, s27 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vstrw.32 q5, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #32] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -901,12 +895,12 @@ define void @vst4_v8f32(<8 x float> *%src, <32 x float> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.32 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.32 {q4, q5, q6, q7}, [r1] @@ -943,55 +937,50 @@ define void @vst4_v16f32(<16 x float> *%src, <64 x float> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #192 ; CHECK-NEXT: sub sp, #192 -; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q7, [r0, #240] -; CHECK-NEXT: vldrw.u32 q3, [r0, #208] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #208] ; CHECK-NEXT: vldrw.u32 q2, [r0, #144] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, 
#64] ; CHECK-NEXT: add r2, sp, #128 -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #192] +; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q4, [r0, #240] +; CHECK-NEXT: vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill +; CHECK-NEXT: add r2, sp, #128 +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q6, [r0, #160] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0, #192] -; CHECK-NEXT: vldrw.u32 q2, [r0, #128] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vldrw.u32 q3, [r0, #224] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: 
vmov q5, q1 +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r1] ; CHECK-NEXT: vst43.32 {q0, q1, q2, q3}, [r0]! -; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload +; CHECK-NEXT: vldmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst41.32 {q0, q1, q2, q3}, [r0] ; CHECK-NEXT: vst42.32 {q0, q1, q2, q3}, [r0] @@ -1033,28 +1022,28 @@ define void @vst4_v4f32_align1(<4 x float> *%src, <16 x float> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: vmov.f64 d10, d8 -; CHECK-NEXT: vmov.f32 s24, s19 -; CHECK-NEXT: vmov.f32 s13, s9 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vmov.f32 s14, s1 ; CHECK-NEXT: vmov.f32 s22, s0 ; CHECK-NEXT: vmov.f32 s26, s3 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: vmov.f32 s13, s9 ; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s20, s16 ; CHECK-NEXT: vstrb.8 q3, [r1, #16] -; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s23, s4 +; CHECK-NEXT: vmov.f32 s24, s19 ; CHECK-NEXT: vstrb.8 q5, 
[r1] -; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vstrb.8 q6, [r1, #48] +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r1, #32] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr @@ -1079,17 +1068,18 @@ entry: define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldmia r0, {s0, s1} +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldr s5, [r0, #4] ; CHECK-NEXT: vldr s4, [r0, #8] ; CHECK-NEXT: vmovx.f16 s2, s0 -; CHECK-NEXT: vldr s5, [r0, #12] -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vldr s1, [r0, #12] +; CHECK-NEXT: vmovx.f16 s6, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s1 +; CHECK-NEXT: vins.f16 s0, s5 +; CHECK-NEXT: vins.f16 s3, s6 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vstrh.16 q0, [r1] ; CHECK-NEXT: bx lr @@ -1122,37 +1112,33 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) { ; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.32 q0[1], r12 ; CHECK-NEXT: ldrd r2, r12, [r0] -; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: ldrd r3, r0, [r0, #8] -; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vmov.32 q1[0], r2 ; CHECK-NEXT: vmov.32 q2[1], r0 ; CHECK-NEXT: vmov.32 q1[1], r12 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s9 -; CHECK-NEXT: vmovx.f16 s14, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s8, s6 -; 
CHECK-NEXT: vins.f16 s14, s8 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s6, s2 ; CHECK-NEXT: vmovx.f16 s11, s1 -; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s13, s3 -; CHECK-NEXT: vins.f16 s11, s13 +; CHECK-NEXT: vmovx.f16 s2, s3 ; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vins.f16 s10, s13 +; CHECK-NEXT: vins.f16 s11, s2 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s5, s0 -; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vins.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s9, s1 -; CHECK-NEXT: vmov.f32 s7, s12 +; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vstrh.16 q1, [r1] ; CHECK-NEXT: pop {r7, pc} entry: @@ -1205,12 +1191,12 @@ define void @vst4_v16f16(<16 x half> *%src, <64 x half> *%dst) { ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q7, [r0, #96] -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] ; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vst40.16 {q4, q5, q6, q7}, [r1] ; CHECK-NEXT: vst41.16 {q4, q5, q6, q7}, [r1] @@ -1241,70 +1227,61 @@ entry: define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) { ; CHECK-LABEL: vst4_v8f16_align1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: 
sub sp, #8 -; CHECK-NEXT: vldrw.u32 q7, [r0, #48] +; CHECK-NEXT: .vsave {d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q6, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vmovx.f16 s0, s29 -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmovx.f16 s12, s25 -; CHECK-NEXT: vstr s2, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmovx.f16 s2, s21 +; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s9 -; CHECK-NEXT: vins.f16 s5, s29 -; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s12, s25 ; CHECK-NEXT: vmovx.f16 s19, s4 -; CHECK-NEXT: vmovx.f16 s12, s28 -; CHECK-NEXT: vins.f16 s9, s25 +; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s12, s20 ; CHECK-NEXT: vins.f16 s19, s12 -; CHECK-NEXT: vmovx.f16 s14, s8 -; CHECK-NEXT: vmovx.f16 s12, s24 -; CHECK-NEXT: vins.f16 s14, s12 -; CHECK-NEXT: vins.f16 s4, s28 -; CHECK-NEXT: vstr s14, [sp] @ 4-byte Spill +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s14, s24 ; CHECK-NEXT: vmovx.f16 s15, s7 -; CHECK-NEXT: vmovx.f16 s20, s31 -; CHECK-NEXT: vins.f16 s8, s24 -; CHECK-NEXT: vins.f16 s15, s20 -; CHECK-NEXT: vmovx.f16 s20, s27 +; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vmovx.f16 s14, s23 +; CHECK-NEXT: vins.f16 s15, s14 ; CHECK-NEXT: vmovx.f16 s14, s11 -; CHECK-NEXT: vins.f16 s7, s31 -; CHECK-NEXT: vins.f16 s14, s20 +; CHECK-NEXT: vmovx.f16 s1, s27 +; CHECK-NEXT: vins.f16 s7, s23 +; CHECK-NEXT: vins.f16 s14, s1 ; CHECK-NEXT: vmovx.f16 s23, s6 -; CHECK-NEXT: vmovx.f16 s28, s30 -; CHECK-NEXT: vins.f16 s6, s30 -; CHECK-NEXT: vins.f16 s23, s28 -; CHECK-NEXT: vins.f16 s11, s27 +; CHECK-NEXT: vmovx.f16 s1, s22 +; CHECK-NEXT: vins.f16 s6, s22 +; CHECK-NEXT: vins.f16 s5, s21 +; CHECK-NEXT: vins.f16 s4, s20 +; CHECK-NEXT: vins.f16 s23, s1 ; CHECK-NEXT: vmovx.f16 s22, s10 -; CHECK-NEXT: vmovx.f16 s24, s26 -; 
CHECK-NEXT: vldr s28, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vins.f16 s22, s24 ; CHECK-NEXT: vins.f16 s10, s26 +; CHECK-NEXT: vmovx.f16 s1, s26 +; CHECK-NEXT: vins.f16 s9, s25 +; CHECK-NEXT: vins.f16 s8, s24 +; CHECK-NEXT: vins.f16 s11, s27 ; CHECK-NEXT: vmov q6, q1 -; CHECK-NEXT: vmov.f32 s27, s28 -; CHECK-NEXT: vldr s28, [sp] @ 4-byte Reload -; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmov.f32 s1, s25 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s21, s6 -; CHECK-NEXT: vmov.f32 s12, s11 -; CHECK-NEXT: vmov.f32 s20, s10 -; CHECK-NEXT: vstrb.8 q3, [r1, #48] -; CHECK-NEXT: vmov.f32 s3, s27 ; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov.f32 s26, s28 -; CHECK-NEXT: vstrb.8 q5, [r1, #32] -; CHECK-NEXT: vmov.f32 s25, s4 +; CHECK-NEXT: vmov.f32 s3, s0 +; CHECK-NEXT: vmov.f32 s0, s9 +; CHECK-NEXT: vmov.f32 s26, s12 ; CHECK-NEXT: vstrb.8 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s25, s4 ; CHECK-NEXT: vmov.f32 s27, s19 +; CHECK-NEXT: vmov.f32 s13, s7 ; CHECK-NEXT: vstrb.8 q6, [r1] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.f32 s12, s11 +; CHECK-NEXT: vmov.f32 s21, s6 +; CHECK-NEXT: vstrb.8 q3, [r1, #48] +; CHECK-NEXT: vmov.f32 s20, s10 +; CHECK-NEXT: vstrb.8 q5, [r1, #32] +; CHECK-NEXT: vpop {d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0 @@ -1329,15 +1306,15 @@ define void @vst4_v2f64(<2 x double> *%src, <8 x double> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d10, d6 +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vmov.f64 d9, d0 +; CHECK-NEXT: vmov.f64 d8, d4 ; CHECK-NEXT: 
vmov.f64 d11, d2 ; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d10, d6 ; CHECK-NEXT: vmov.f64 d0, d5 ; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f64 d2, d7 @@ -1369,32 +1346,32 @@ define void @vst4_v4f64(<4 x double> *%src, <16 x double> *%dst) { ; CHECK-NEXT: .pad #64 ; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d14, d12 +; CHECK-NEXT: vldrw.u32 q6, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d15, d10 +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #48] ; CHECK-NEXT: vldrw.u32 q4, [r0, #112] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d15, d10 +; CHECK-NEXT: vmov.f64 d14, d12 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.f64 d14, d4 ; CHECK-NEXT: vmov.f64 d15, d2 -; CHECK-NEXT: vmov.f64 d2, d5 ; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d4, d0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d10, d13 -; CHECK-NEXT: vmov.f64 d12, d0 +; CHECK-NEXT: vmov.f64 d2, d5 ; CHECK-NEXT: vstrw.32 q5, [r1, #32] ; CHECK-NEXT: vmov.f64 d5, d6 +; CHECK-NEXT: vstrw.32 q1, [r1, #48] ; CHECK-NEXT: vmov.f64 d13, d8 ; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vmov.f64 d12, d0 ; CHECK-NEXT: vmov.f64 d8, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q6, [r1, #80] diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll index d055469064e52..5e3546585e94b 100644 --- 
a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll @@ -56,14 +56,14 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pS ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q4, [r0] -; CHECK-NEXT: vmov.f64 d0, d8 ; CHECK-NEXT: vmov.i64 q5, #0xffffffff +; CHECK-NEXT: vmov.f32 s0, s16 ; CHECK-NEXT: vmov.f32 s2, s17 ; CHECK-NEXT: vand q6, q0, q5 ; CHECK-NEXT: vmov r0, r1, d13 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov.f64 d0, d9 +; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vmov.f32 s2, s19 ; CHECK-NEXT: vmov d9, r0, r1 ; CHECK-NEXT: vand q5, q0, q5 From 9de882fdbf7436c9ddd9b35b335ec91a524353a5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 17 Aug 2021 09:20:24 -0400 Subject: [PATCH 205/700] [libc++][NFC] Refactor tests for transform_view Adjust the names of helper function objects to represent better what they do, as suggested in the review of D107098. 
--- .../range.transform/base.pass.cpp | 6 +- .../range.transform/begin.pass.cpp | 10 ++-- .../range.transform/ctad.compile.pass.cpp | 12 ++-- .../range.transform/end.pass.cpp | 28 +++++----- .../range.transform/general.pass.cpp | 4 +- .../iterator/arithmetic.pass.cpp | 4 +- .../range.transform/iterator/base.pass.cpp | 6 +- .../range.transform/iterator/compare.pass.cpp | 12 ++-- .../range.transform/iterator/ctor.pass.cpp | 10 ++-- .../range.transform/iterator/deref.pass.cpp | 55 +++++++++++++------ .../iterator/iter_move.pass.cpp | 6 +- .../iterator/plus_minus.pass.cpp | 4 +- .../iterator/requirements.compile.pass.cpp | 8 +-- .../iterator/sentinel.pass.cpp | 8 +-- .../iterator/subscript.pass.cpp | 10 ++-- .../range.transform/iterator/types.pass.cpp | 14 ++--- .../range.transform/size.pass.cpp | 10 ++-- .../range.adaptors/range.transform/types.h | 8 +-- 18 files changed, 117 insertions(+), 98 deletions(-) diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/base.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/base.pass.cpp index 882e1623dda7d..6e9572d12ada4 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/base.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/base.pass.cpp @@ -20,14 +20,14 @@ constexpr bool test() { { - std::ranges::transform_view transformView; + std::ranges::transform_view transformView; ContiguousView base = std::move(transformView).base(); ASSERT_SAME_TYPE(ContiguousView, decltype(std::move(transformView).base())); assert(std::ranges::begin(base) == globalBuff); } { - std::ranges::transform_view transformView; + std::ranges::transform_view transformView; CopyableView base1 = transformView.base(); ASSERT_SAME_TYPE(CopyableView, decltype(transformView.base())); assert(std::ranges::begin(base1) == globalBuff); @@ -38,7 +38,7 @@ constexpr bool test() { } { - const std::ranges::transform_view transformView; + const std::ranges::transform_view transformView; const CopyableView base1 = 
transformView.base(); ASSERT_SAME_TYPE(CopyableView, decltype(transformView.base())); assert(std::ranges::begin(base1) == globalBuff); diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/begin.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/begin.pass.cpp index 48232c5218613..b4665a5b74f49 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/begin.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/begin.pass.cpp @@ -27,29 +27,29 @@ constexpr bool test() { int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7}; { - std::ranges::transform_view transformView(ContiguousView{buff}, Increment{}); + std::ranges::transform_view transformView(ContiguousView{buff}, PlusOneMutable{}); assert(transformView.begin().base() == buff); assert(*transformView.begin() == 1); } { - std::ranges::transform_view transformView(ForwardView{buff}, Increment{}); + std::ranges::transform_view transformView(ForwardView{buff}, PlusOneMutable{}); assert(transformView.begin().base().base() == buff); assert(*transformView.begin() == 1); } { - std::ranges::transform_view transformView(InputView{buff}, Increment{}); + std::ranges::transform_view transformView(InputView{buff}, PlusOneMutable{}); assert(transformView.begin().base().base() == buff); assert(*transformView.begin() == 1); } { - const std::ranges::transform_view transformView(ContiguousView{buff}, IncrementConst{}); + const std::ranges::transform_view transformView(ContiguousView{buff}, PlusOne{}); assert(*transformView.begin() == 1); } - static_assert(!BeginInvocable>); + static_assert(!BeginInvocable>); return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/ctad.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/ctad.compile.pass.cpp index c6da9f2e00c54..c07c79452d956 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/ctad.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/ctad.compile.pass.cpp @@ 
-18,9 +18,9 @@ #include "test_macros.h" #include "types.h" -static_assert(std::same_as>); -static_assert(std::same_as(), Increment())), - std::ranges::transform_view, Increment>>); -static_assert(std::same_as, Increment>>); +static_assert(std::same_as>); +static_assert(std::same_as(), PlusOne())), + std::ranges::transform_view, PlusOne>>); +static_assert(std::same_as, PlusOne>>); diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/end.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/end.pass.cpp index 931906353e44a..9c11ef6469b6f 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/end.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/end.pass.cpp @@ -32,36 +32,36 @@ concept EndIsIter = requires(T t) { ++t.end(); }; constexpr bool test() { { - std::ranges::transform_view transformView(ContiguousView{}, Increment{}); + std::ranges::transform_view transformView(ContiguousView{}, PlusOneMutable{}); assert(transformView.end().base() == globalBuff + 8); } { - std::ranges::transform_view transformView(ForwardView{}, Increment{}); + std::ranges::transform_view transformView(ForwardView{}, PlusOneMutable{}); assert(transformView.end().base().base() == globalBuff + 8); } { - std::ranges::transform_view transformView(InputView{}, Increment{}); + std::ranges::transform_view transformView(InputView{}, PlusOneMutable{}); assert(transformView.end().base() == globalBuff + 8); } { - const std::ranges::transform_view transformView(ContiguousView{}, IncrementConst{}); + const std::ranges::transform_view transformView(ContiguousView{}, PlusOne{}); assert(transformView.end().base() == globalBuff + 8); } - static_assert(!EndInvocable>); - static_assert( EndInvocable< std::ranges::transform_view>); - static_assert( EndInvocable>); - static_assert(!EndInvocable>); - static_assert( EndInvocable< std::ranges::transform_view>); - static_assert( EndInvocable>); + static_assert(!EndInvocable>); + static_assert( EndInvocable< 
std::ranges::transform_view>); + static_assert( EndInvocable>); + static_assert(!EndInvocable>); + static_assert( EndInvocable< std::ranges::transform_view>); + static_assert( EndInvocable>); - static_assert(!EndIsIter>); - static_assert(!EndIsIter< std::ranges::transform_view>); - static_assert( EndIsIter>); - static_assert( EndIsIter< std::ranges::transform_view>); + static_assert(!EndIsIter>); + static_assert(!EndIsIter< std::ranges::transform_view>); + static_assert( EndIsIter>); + static_assert( EndIsIter< std::ranges::transform_view>); return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp index 9df8be1fe8275..aec7143903e4e 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/general.pass.cpp @@ -32,8 +32,8 @@ template concept ValidTransformView = requires { typename std::ranges::transform_view; }; struct BadFunction { }; -static_assert( ValidTransformView); -static_assert(!ValidTransformView); +static_assert( ValidTransformView); +static_assert(!ValidTransformView); static_assert(!ValidTransformView); template diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/arithmetic.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/arithmetic.pass.cpp index ad4b554bcc3df..cde7431bd82a6 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/arithmetic.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/arithmetic.pass.cpp @@ -18,7 +18,7 @@ #include "../types.h" constexpr bool test() { - std::ranges::transform_view transformView; + std::ranges::transform_view transformView; auto iter = std::move(transformView).begin(); assert((++iter).base() == globalBuff + 1); @@ -31,7 +31,7 @@ constexpr bool test() { // Check that decltype(InputIter++) == void. 
ASSERT_SAME_TYPE(decltype( - std::declval>>()++), + std::declval>>()++), void); assert((iter += 4).base() == globalBuff + 4); diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/base.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/base.pass.cpp index c1ecad20e706b..f60ed2a5c8214 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/base.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/base.pass.cpp @@ -24,7 +24,7 @@ concept BaseInvocable = requires(std::ranges::iterator_t transformView; + std::ranges::transform_view transformView; auto iter = std::move(transformView).begin(); ASSERT_SAME_TYPE(int*, decltype(iter.base())); assert(iter.base() == globalBuff); @@ -33,13 +33,13 @@ constexpr bool test() { } { - std::ranges::transform_view transformView; + std::ranges::transform_view transformView; auto iter = transformView.begin(); assert(std::move(iter).base() == globalBuff); ASSERT_SAME_TYPE(cpp20_input_iterator, decltype(std::move(iter).base())); } - static_assert(!BaseInvocable); + static_assert(!BaseInvocable); return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/compare.pass.cpp index 5f734a819a38a..a6d0a5fbed8f1 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/compare.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/compare.pass.cpp @@ -20,9 +20,9 @@ constexpr bool test() { { - std::ranges::transform_view transformView1; + std::ranges::transform_view transformView1; auto iter1 = std::move(transformView1).begin(); - std::ranges::transform_view transformView2; + std::ranges::transform_view transformView2; auto iter2 = std::move(transformView2).begin(); assert(iter1 == iter2); assert(iter1 + 1 != iter2); @@ -39,9 +39,9 @@ constexpr bool test() { // TODO: when three_way_comparable is implemented 
and std::is_eq is implemented, // uncomment this. // { -// std::ranges::transform_view transformView1; +// std::ranges::transform_view transformView1; // auto iter1 = transformView1.begin(); -// std::ranges::transform_view transformView2; +// std::ranges::transform_view transformView2; // auto iter2 = transformView2.begin(); // // assert(std::is_eq(iter1 <=> iter2)); @@ -52,8 +52,8 @@ constexpr bool test() { // assert(std::is_gt(iter2 <=> iter1)); // assert(std::is_gteq(iter2 <=> iter1)); // -// static_assert( std::three_way_comparable>>); -// static_assert(!std::three_way_comparable>>); +// static_assert( std::three_way_comparable>>); +// static_assert(!std::three_way_comparable>>); // } return true; diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/ctor.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/ctor.pass.cpp index f96b8e0b36f47..5e333f8d8d1f5 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/ctor.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/ctor.pass.cpp @@ -57,16 +57,16 @@ struct IterNoDefaultInitView : std::ranges::view_base { }; constexpr bool test() { - std::ranges::transform_view transformView; + std::ranges::transform_view transformView; auto iter = std::move(transformView).begin(); - std::ranges::iterator_t> i2(iter); + std::ranges::iterator_t> i2(iter); (void)i2; - std::ranges::iterator_t> constIter(iter); + std::ranges::iterator_t> constIter(iter); (void)constIter; - static_assert( std::default_initializable>>); - static_assert(!std::default_initializable>>); + static_assert( std::default_initializable>>); + static_assert(!std::default_initializable>>); return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp index 2dc761f156c76..f1bc26d152a5c 100644 --- 
a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/deref.pass.cpp @@ -18,27 +18,46 @@ #include "../types.h" int main(int, char**) { - int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - { - std::ranges::transform_view transformView(ContiguousView{buff}, Increment{}); + int buff[] = {0, 1, 2, 3, 4, 5, 6, 7}; + using View = std::ranges::transform_view; + View transformView(ContiguousView{buff}, PlusOne{}); assert(*transformView.begin() == 1); + static_assert(!noexcept(*std::declval>())); + ASSERT_SAME_TYPE(int, decltype(*std::declval().begin())); + } + { + int buff[] = {0, 1, 2, 3, 4, 5, 6, 7}; + using View = std::ranges::transform_view; + View transformView(ContiguousView{buff}, PlusOneMutable{}); + assert(*transformView.begin() == 1); + static_assert(!noexcept(*std::declval>())); + ASSERT_SAME_TYPE(int, decltype(*std::declval().begin())); + } + { + int buff[] = {0, 1, 2, 3, 4, 5, 6, 7}; + using View = std::ranges::transform_view; + View transformView(ContiguousView{buff}, PlusOneNoexcept{}); + assert(*transformView.begin() == 1); + static_assert(noexcept(*std::declval>())); + ASSERT_SAME_TYPE(int, decltype(*std::declval().begin())); + } + { + int buff[] = {0, 1, 2, 3, 4, 5, 6, 7}; + using View = std::ranges::transform_view; + View transformView(ContiguousView{buff}, Increment{}); + assert(*transformView.begin() == 1); + static_assert(!noexcept(*std::declval>())); + ASSERT_SAME_TYPE(int&, decltype(*std::declval().begin())); + } + { + int buff[] = {0, 1, 2, 3, 4, 5, 6, 7}; + using View = std::ranges::transform_view; + View transformView(ContiguousView{buff}, IncrementRvalueRef{}); + assert(*transformView.begin() == 1); + static_assert(!noexcept(*std::declval>())); + ASSERT_SAME_TYPE(int&&, decltype(*std::declval().begin())); } - - static_assert(!noexcept( - *std::declval>>())); - static_assert( noexcept( - *std::declval>>())); - - ASSERT_SAME_TYPE( - int, - 
decltype(*std::declval>().begin())); - ASSERT_SAME_TYPE( - int&, - decltype(*std::declval>().begin())); - ASSERT_SAME_TYPE( - int&&, - decltype(*std::declval>().begin())); return 0; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp index 68663785d4ca8..85a46d99e4904 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/iter_move.pass.cpp @@ -21,7 +21,7 @@ constexpr bool test() { int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7}; { - std::ranges::transform_view transformView(ContiguousView{buff}, Increment{}); + std::ranges::transform_view transformView(ContiguousView{buff}, PlusOneMutable{}); auto iter = transformView.begin(); static_assert(!noexcept(std::ranges::iter_move(iter))); @@ -34,9 +34,9 @@ constexpr bool test() { { static_assert( noexcept(std::ranges::iter_move( - std::declval>&>()))); + std::declval>&>()))); static_assert(!noexcept(std::ranges::iter_move( - std::declval>&>()))); + std::declval>&>()))); } return true; diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/plus_minus.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/plus_minus.pass.cpp index a003945b9f23a..4a97d75958a9c 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/plus_minus.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/plus_minus.pass.cpp @@ -18,9 +18,9 @@ #include "../types.h" constexpr bool test() { - std::ranges::transform_view transformView1; + std::ranges::transform_view transformView1; auto iter1 = std::move(transformView1).begin(); - std::ranges::transform_view transformView2; + std::ranges::transform_view transformView2; auto iter2 = std::move(transformView2).begin(); iter1 += 4; assert((iter1 + 1).base() == globalBuff + 5); diff --git 
a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/requirements.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/requirements.compile.pass.cpp index ae7485b5d215b..2932ad14a6fcc 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/requirements.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/requirements.compile.pass.cpp @@ -17,8 +17,8 @@ #include "test_macros.h" #include "../types.h" -static_assert(std::ranges::bidirectional_range>); -static_assert(!std::ranges::bidirectional_range>); +static_assert(std::ranges::bidirectional_range>); +static_assert(!std::ranges::bidirectional_range>); -static_assert(std::ranges::random_access_range>); -static_assert(!std::ranges::random_access_range>); +static_assert(std::ranges::random_access_range>); +static_assert(!std::ranges::random_access_range>); diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/sentinel.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/sentinel.pass.cpp index d12ece51c0c97..1c4ef28f9d38e 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/sentinel.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/sentinel.pass.cpp @@ -21,18 +21,18 @@ template concept EndIsIter = requires(T t) { ++t.end(); }; constexpr bool test() { - std::ranges::transform_view transformView1; + std::ranges::transform_view transformView1; // Going to const and back. 
auto sent1 = transformView1.end(); - std::ranges::sentinel_t> sent2{sent1}; - std::ranges::sentinel_t> sent3{sent2}; + std::ranges::sentinel_t> sent2{sent1}; + std::ranges::sentinel_t> sent3{sent2}; (void)sent3; static_assert(!EndIsIter); static_assert(!EndIsIter); assert(sent1.base() == globalBuff + 8); - std::ranges::transform_view transformView2(SizedSentinelView{4}, IncrementConst()); + std::ranges::transform_view transformView2(SizedSentinelView{4}, PlusOne()); auto sent4 = transformView2.end(); auto iter = transformView1.begin(); { diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/subscript.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/subscript.pass.cpp index 90a673df15a01..567f646fbd14d 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/subscript.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/subscript.pass.cpp @@ -19,22 +19,22 @@ constexpr bool test() { int buff[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - std::ranges::transform_view transformView1(ContiguousView{buff}, Increment{}); + std::ranges::transform_view transformView1(ContiguousView{buff}, PlusOneMutable{}); auto iter1 = std::move(transformView1).begin() + 1; assert(iter1[0] == 2); assert(iter1[4] == 6); static_assert(!noexcept( - std::declval>>()[0])); + std::declval>>()[0])); static_assert( noexcept( - std::declval>>()[0])); + std::declval>>()[0])); ASSERT_SAME_TYPE( int, - decltype(std::declval>().begin()[0])); + decltype(std::declval>().begin()[0])); ASSERT_SAME_TYPE( int&, - decltype(std::declval>().begin()[0])); + decltype(std::declval>().begin()[0])); ASSERT_SAME_TYPE( int&&, decltype(std::declval>().begin()[0])); diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/types.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/types.pass.cpp index 7c03c47bcdc8c..de7747023f5a4 100644 --- 
a/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/types.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/iterator/types.pass.cpp @@ -29,7 +29,7 @@ constexpr bool test() { static_assert(std::same_as::iterator_concept, std::contiguous_iterator_tag>); static_assert(std::same_as::iterator_category, std::random_access_iterator_tag>); - using TView = std::ranges::transform_view; + using TView = std::ranges::transform_view; using TIter = std::ranges::iterator_t; static_assert(std::same_as); static_assert(std::same_as); @@ -38,7 +38,7 @@ constexpr bool test() { } { // Member typedefs for random access iterator. - using TView = std::ranges::transform_view; + using TView = std::ranges::transform_view; using TIter = std::ranges::iterator_t; static_assert(std::same_as); static_assert(std::same_as); @@ -47,7 +47,7 @@ constexpr bool test() { } { // Member typedefs for random access iterator/not-lvalue-ref. - using TView = std::ranges::transform_view; + using TView = std::ranges::transform_view; using TIter = std::ranges::iterator_t; static_assert(std::same_as); static_assert(std::same_as); // Note: this is now input_iterator_tag. @@ -56,7 +56,7 @@ constexpr bool test() { } { // Member typedefs for bidirectional iterator. - using TView = std::ranges::transform_view; + using TView = std::ranges::transform_view; using TIter = std::ranges::iterator_t; static_assert(std::same_as); static_assert(std::same_as); @@ -65,7 +65,7 @@ constexpr bool test() { } { // Member typedefs for forward iterator. - using TView = std::ranges::transform_view; + using TView = std::ranges::transform_view; using TIter = std::ranges::iterator_t; static_assert(std::same_as); static_assert(std::same_as); @@ -74,10 +74,10 @@ constexpr bool test() { } { // Member typedefs for input iterator. 
- using TView = std::ranges::transform_view; + using TView = std::ranges::transform_view; using TIter = std::ranges::iterator_t; static_assert(std::same_as); - static_assert(!HasIterCategory); + static_assert(!HasIterCategory); static_assert(std::same_as); static_assert(std::same_as); } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/size.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.transform/size.pass.cpp index d0de0e7a72c6a..0402d44f66447 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/size.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/size.pass.cpp @@ -23,19 +23,19 @@ concept SizeInvocable = requires(T t) { t.size(); }; constexpr bool test() { { - std::ranges::transform_view transformView(ContiguousView{}, Increment{}); + std::ranges::transform_view transformView(ContiguousView{}, PlusOne{}); assert(transformView.size() == 8); } { - const std::ranges::transform_view transformView(ContiguousView{globalBuff, 4}, Increment{}); + const std::ranges::transform_view transformView(ContiguousView{globalBuff, 4}, PlusOne{}); assert(transformView.size() == 4); } - static_assert(!SizeInvocable>); + static_assert(!SizeInvocable>); - static_assert(SizeInvocable>); - static_assert(!SizeInvocable>); + static_assert(SizeInvocable>); + static_assert(!SizeInvocable>); return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.transform/types.h b/libcxx/test/std/ranges/range.adaptors/range.transform/types.h index 429a5ed969d91..159db9b4c97bf 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.transform/types.h +++ b/libcxx/test/std/ranges/range.adaptors/range.transform/types.h @@ -129,15 +129,15 @@ struct ThreeWayCompView : std::ranges::view_base { constexpr ThreeWayCompIter end() const { return ThreeWayCompIter(globalBuff + 8); } }; -struct Increment { +struct PlusOneMutable { constexpr int operator()(int x) { return x + 1; } }; -struct IncrementConst { +struct PlusOne { constexpr int 
operator()(int x) const { return x + 1; } }; -struct IncrementRef { +struct Increment { constexpr int& operator()(int& x) { return ++x; } }; @@ -145,7 +145,7 @@ struct IncrementRvalueRef { constexpr int&& operator()(int& x) { return std::move(++x); } }; -struct IncrementNoexcept { +struct PlusOneNoexcept { constexpr int operator()(int x) noexcept { return x + 1; } }; From ef198cd99e6bac3a2e87adb6c8a18fb461056fa6 Mon Sep 17 00:00:00 2001 From: Dylan Fleming Date: Tue, 17 Aug 2021 14:00:47 +0100 Subject: [PATCH 206/700] [SVE] Remove usage of getMaxVScale for AArch64, in favour of IR Attribute Removed AArch64 usage of the getMaxVScale interface, replacing it with the vscale_range(min, max) IR Attribute. Reviewed By: paulwalker-arm Differential Revision: https://reviews.llvm.org/D106277 --- clang/include/clang/Basic/TargetInfo.h | 5 +++++ clang/lib/Basic/Targets/AArch64.cpp | 11 ++++++++++ clang/lib/Basic/Targets/AArch64.h | 3 +++ clang/lib/CodeGen/CodeGenFunction.cpp | 12 ++++++----- .../arm-sve-vector-bits-vscale-range.c | 5 ++++- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- .../AArch64/AArch64TargetTransformInfo.h | 21 +++++++++++-------- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +++++++ .../Analysis/CostModel/AArch64/sve-gather.ll | 2 +- .../Analysis/CostModel/AArch64/sve-scatter.ll | 2 +- .../AArch64/first-order-recurrence.ll | 5 +++-- .../AArch64/scalable-strict-fadd.ll | 13 ++++++------ .../AArch64/scalable-vectorization.ll | 12 ++++++----- .../LoopVectorize/AArch64/scalable-vf-hint.ll | 17 ++++++++------- .../AArch64/sve-cond-inv-loads.ll | 7 ++++--- .../AArch64/sve-gather-scatter.ll | 12 ++++++----- .../LoopVectorize/AArch64/sve-inv-store.ll | 2 +- .../AArch64/sve-large-strides.ll | 8 +++---- .../AArch64/sve-strict-fadd-cost.ll | 2 +- .../LoopVectorize/AArch64/sve-widen-phi.ll | 6 +++--- 20 files changed, 98 insertions(+), 56 deletions(-) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 
c7a57a7dba9a8..21289b0dfd04c 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -871,6 +871,11 @@ class TargetInfo : public virtual TransferrableTargetInfo, /// across the current set of primary and secondary targets. virtual ArrayRef getTargetBuiltins() const = 0; + /// Returns target-specific min and max values VScale_Range. + virtual Optional> + getVScaleRange(const LangOptions &LangOpts) const { + return None; + } /// The __builtin_clz* and __builtin_ctz* built-in /// functions are specified to have undefined results for zero inputs, but /// on targets that support these operations in a way that provides diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index e163ebfa2348b..2b5bf34a7b23f 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -424,6 +424,17 @@ ArrayRef AArch64TargetInfo::getTargetBuiltins() const { Builtin::FirstTSBuiltin); } +Optional> +AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts) const { + if (LangOpts.ArmSveVectorBits) { + unsigned VScale = LangOpts.ArmSveVectorBits / 128; + return std::pair(VScale, VScale); + } + if (hasFeature("sve")) + return std::pair(0, 16); + return None; +} + bool AArch64TargetInfo::hasFeature(StringRef Feature) const { return Feature == "aarch64" || Feature == "arm64" || Feature == "arm" || (Feature == "neon" && (FPU & NeonMode)) || diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 46882a808336b..12830348fb453 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -96,6 +96,9 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { ArrayRef getTargetBuiltins() const override; + Optional> + getVScaleRange(const LangOptions &LangOpts) const override; + bool hasFeature(StringRef Feature) const override; bool handleTargetFeatures(std::vector &Features, DiagnosticsEngine &Diags) override; diff 
--git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index f5eed8572daa3..dca42045325df 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -484,11 +484,13 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { // function. CurFn->addFnAttr("min-legal-vector-width", llvm::utostr(LargestVectorWidth)); - // Add vscale attribute if appropriate. - if (getLangOpts().ArmSveVectorBits) { - unsigned VScale = getLangOpts().ArmSveVectorBits / 128; - CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs(getLLVMContext(), - VScale, VScale)); + // Add vscale_range attribute if appropriate. + Optional> VScaleRange = + getContext().getTargetInfo().getVScaleRange(getLangOpts()); + if (VScaleRange) { + CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( + getLLVMContext(), VScaleRange.getValue().first, + VScaleRange.getValue().second)); } // If we generated an unreachable return block, delete it now. diff --git a/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c b/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c index 84541f9cb12db..eb5c4f31044cf 100644 --- a/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c +++ b/clang/test/CodeGen/arm-sve-vector-bits-vscale-range.c @@ -3,10 +3,13 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -msve-vector-bits=512 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=512 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -msve-vector-bits=1024 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=1024 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -msve-vector-bits=2048 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=2048 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -msve-vector-bits=128 -S -emit-llvm -o - %s | FileCheck %s -D#VBITS=128 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -msve-vector-bits=256 -S -emit-llvm -o - %s | 
FileCheck %s -D#VBITS=256 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2 -msve-vector-bits=scalable -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-NONE // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -msve-vector-bits=scalable -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-NONE // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-NONE // CHECK-LABEL: @func() #0 // CHECK: attributes #0 = { {{.*}} vscale_range([[#div(VBITS,128)]],[[#div(VBITS,128)]]) {{.*}} } -// CHECK-NONE-NOT: vscale_range +// CHECK-NONE: attributes #0 = { {{.*}} vscale_range(0,16) {{.*}} } void func() {} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 882f1c01664fc..1c20dddfbf4b9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1589,7 +1589,7 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost( ElementCount LegalVF = LT.second.getVectorElementCount(); InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); - return LT.first * MemOpCost * getMaxNumElements(LegalVF); + return LT.first * MemOpCost * getMaxNumElements(LegalVF, I->getFunction()); } bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 5369eb7b7e828..5c095048ba0a3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -125,22 +125,25 @@ class AArch64TTIImpl : public BasicTTIImplBase { return ST->getMinVectorRegisterBitWidth(); } - Optional getMaxVScale() const { - if (ST->hasSVE()) - return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; - return 
BaseT::getMaxVScale(); - } /// Try to return an estimate cost factor that can be used as a multiplier /// when scalarizing an operation for a vector with ElementCount \p VF. /// For scalable vectors this currently takes the most pessimistic view based /// upon the maximum possible value for vscale. - unsigned getMaxNumElements(ElementCount VF) const { + unsigned getMaxNumElements(ElementCount VF, + const Function *F = nullptr) const { if (!VF.isScalable()) return VF.getFixedValue(); - Optional MaxNumVScale = getMaxVScale(); - assert(MaxNumVScale && "Expected valid max vscale value"); - return *MaxNumVScale * VF.getKnownMinValue(); + + unsigned MaxNumVScale = 16; + if (F && F->hasFnAttribute(Attribute::VScaleRange)) { + unsigned VScaleMax = + F->getFnAttribute(Attribute::VScaleRange).getVScaleRangeArgs().second; + if (VScaleMax > 0) + MaxNumVScale = VScaleMax; + } + + return MaxNumVScale * VF.getKnownMinValue(); } unsigned getMaxInterleaveFactor(unsigned VF); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b842d15fe1874..00416efb03253 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5636,6 +5636,13 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional MaxVScale = TTI.getMaxVScale(); + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) + .getVScaleRangeArgs() + .second; + if (VScaleMax > 0) + MaxVScale = VScaleMax; + } MaxScalableVF = ElementCount::getScalable( MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll index 302c191d4fc1d..866e038f14544 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll @@ -2,7 +2,7 @@ ; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s -define void @masked_gathers( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) { +define void @masked_gathers( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) vscale_range(0, 16) { ; CHECK-LABEL: 'masked_gathers' ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res.nxv4i32 = call @llvm.masked.gather.nxv4i32.nxv4p0i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res.nxv8i32 = call @llvm.masked.gather.nxv8i32.nxv8p0i32 diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll index 503e6ce5e104f..a5fa33277b79e 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll @@ -2,7 +2,7 @@ ; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s -define void @masked_scatters( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) { +define void @masked_scatters( %nxv4i1mask, %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, %nxv1i1mask) vscale_range(0, 16) { ; CHECK-LABEL: 'masked_scatters' ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32 ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll index 1669f4aa476ea..f9065a6126574 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll @@ -18,7 +18,7 @@ ; return a; ; } ; -define i32 @PR33613(double* %b, double %j, i32 %d) { +define i32 @PR33613(double* %b, double %j, i32 %d) #0 { ; CHECK-VF4UF2-LABEL: @PR33613 ; CHECK-VF4UF2: vector.body ; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi [ {{.*}}, %vector.ph ], [ {{.*}}, %vector.body ] @@ -66,7 +66,7 @@ for.body: ; } ; ; Check that the sext sank after the load in the vector loop. -define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) { +define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) #0 { ; CHECK-VF4UF1-LABEL: @PR34711 ; CHECK-VF4UF1: vector.body ; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ] @@ -100,5 +100,6 @@ for.end: ret void } +attributes #0 = { vscale_range(0, 16) } !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index 007670324fadc..cba948ed1dae0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -4,7 +4,7 @@ ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED -define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { +define float @fadd_strict(float* 
noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_strict ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] @@ -49,7 +49,7 @@ for.end: ret float %add } -define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) { +define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_strict_unroll ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ] @@ -113,7 +113,7 @@ for.end: ret float %add } -define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { +define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_strict_interleave ; CHECK-ORDERED: entry ; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1 @@ -206,7 +206,7 @@ for.end: ret void } -define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { +define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_of_sum ; CHECK-ORDERED: vector.body ; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] @@ -268,7 +268,7 @@ for.end: ; preds = %for.body, %entry ret float %res } -define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { +define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_conditional ; CHECK-ORDERED: vector.body ; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ] @@ -343,7 +343,7 
@@ for.end: } ; Negative test - loop contains multiple fadds which we cannot safely reorder -define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) { +define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) #0 { ; CHECK-ORDERED-LABEL: @fadd_multiple ; CHECK-ORDERED-NOT: vector.body @@ -390,6 +390,7 @@ for.end: ; preds = %for.body ret float %rdx } +attributes #0 = { vscale_range(0, 16) } !0 = distinct !{!0, !3, !6, !8} !1 = distinct !{!1, !3, !7, !8} !2 = distinct !{!2, !4, !6, !8} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll index 1d9d9d8545408..8d53ae5a0b5d9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -7,7 +7,7 @@ ; Test that the MaxVF for the following loop, that has no dependence distances, ; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 ; (maximized bandwidth for i8 in the loop). -define void @test0(i32* %a, i8* %b, i32* %c) { +define void @test0(i32* %a, i8* %b, i32* %c) #0 { ; CHECK: LV: Checking a loop in "test0" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4 @@ -40,7 +40,7 @@ exit: ; Test that the MaxVF for the following loop, with a dependence distance ; of 64 elements, is calculated as (maxvscale = 16) * 4. -define void @test1(i32* %a, i8* %b) { +define void @test1(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test1" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4 @@ -74,7 +74,7 @@ exit: ; Test that the MaxVF for the following loop, with a dependence distance ; of 32 elements, is calculated as (maxvscale = 16) * 2. 
-define void @test2(i32* %a, i8* %b) { +define void @test2(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test2" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4 @@ -108,7 +108,7 @@ exit: ; Test that the MaxVF for the following loop, with a dependence distance ; of 16 elements, is calculated as (maxvscale = 16) * 1. -define void @test3(i32* %a, i8* %b) { +define void @test3(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test3" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 ; CHECK_SCALABLE_ON: LV: Selecting VF: 4 @@ -142,7 +142,7 @@ exit: ; Test the fallback mechanism when scalable vectors are not feasible due ; to e.g. dependence distance. -define void @test4(i32* %a, i32* %b) { +define void @test4(i32* %a, i32* %b) #0 { ; CHECK: LV: Checking a loop in "test4" ; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_ON: LV: Selecting VF: 4 @@ -172,3 +172,5 @@ loop: exit: ret void } + +attributes #0 = { vscale_range(0, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll index a04b3c759e9b0..246dcd2370880 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll @@ -45,7 +45,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test1 ; CHECK: <4 x i32> -define void @test1(i32* %a, i32* %b) { +define void @test1(i32* %a, i32* %b) #0 { entry: br label %loop @@ -90,7 +90,7 @@ exit: ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test2 ; CHECK: <4 x i32> -define void @test2(i32* %a, i32* %b) { +define void @test2(i32* %a, i32* %b) #0 { entry: br label %loop @@ -138,7 +138,7 @@ exit: ; CHECK-DBG: LV: Using user VF vscale x 2. 
; CHECK-LABEL: @test3 ; CHECK: -define void @test3(i32* %a, i32* %b) { +define void @test3(i32* %a, i32* %b) #0 { entry: br label %loop @@ -190,7 +190,7 @@ exit: ; CHECK-DBG: LV: Selecting VF: 4. ; CHECK-LABEL: @test4 ; CHECK: <4 x i32> -define void @test4(i32* %a, i32* %b) { +define void @test4(i32* %a, i32* %b) #0 { entry: br label %loop @@ -238,7 +238,7 @@ exit: ; CHECK-DBG: LV: Using user VF vscale x 4 ; CHECK-LABEL: @test5 ; CHECK: -define void @test5(i32* %a, i32* %b) { +define void @test5(i32* %a, i32* %b) #0 { entry: br label %loop @@ -289,7 +289,7 @@ exit: ; CHECK-DBG: Selecting VF: vscale x 4. ; CHECK-LABEL: @test6 ; CHECK: -define void @test6(i32* %a, i32* %b) { +define void @test6(i32* %a, i32* %b) #0 { entry: br label %loop @@ -322,7 +322,7 @@ exit: ; CHECK-NO-SVE-LABEL: @test_no_sve ; CHECK-NO-SVE: <4 x i32> ; CHECK-NO-SVE-NOT: -define void @test_no_sve(i32* %a, i32* %b) { +define void @test_no_sve(i32* %a, i32* %b) #0 { entry: br label %loop @@ -356,7 +356,7 @@ exit: ; CHECK-DBG: LV: Selecting VF: 4. 
; CHECK-LABEL: @test_no_max_vscale ; CHECK: <4 x i32> -define void @test_no_max_vscale(i32* %a, i32* %b) { +define void @test_no_max_vscale(i32* %a, i32* %b) #0 { entry: br label %loop @@ -378,6 +378,7 @@ exit: ret void } +attributes #0 = { vscale_range(0, 16) } !21 = !{!21, !22, !23} !22 = !{!"llvm.loop.vectorize.width", i32 4} !23 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 3054f3a6ac971..bc083a2bc870c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -1,6 +1,6 @@ ; RUN: opt -loop-vectorize -scalable-vectorization=on -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s -define void @cond_inv_load_i32i32i16(i32* noalias nocapture %a, i32* noalias nocapture readonly %cond, i16* noalias nocapture readonly %inv, i64 %n) { +define void @cond_inv_load_i32i32i16(i32* noalias nocapture %a, i32* noalias nocapture readonly %cond, i16* noalias nocapture readonly %inv, i64 %n) #0 { ; CHECK-LABEL: @cond_inv_load_i32i32i16 ; CHECK: vector.ph: ; CHECK: %[[INVINS:.*]] = insertelement poison, i16* %inv, i32 0 @@ -39,7 +39,7 @@ exit: ; preds = %for.inc ret void } -define void @cond_inv_load_f64f64f64(double* noalias nocapture %a, double* noalias nocapture readonly %cond, double* noalias nocapture readonly %inv, i64 %n) { +define void @cond_inv_load_f64f64f64(double* noalias nocapture %a, double* noalias nocapture readonly %cond, double* noalias nocapture readonly %inv, i64 %n) #0 { ; CHECK-LABEL: @cond_inv_load_f64f64f64 ; CHECK: vector.ph: ; CHECK: %[[INVINS:.*]] = insertelement poison, double* %inv, i32 0 @@ -76,7 +76,7 @@ exit: ; preds = %for.inc ret void } -define void @invariant_load_cond(i32* noalias nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %cond, i64 %n) { 
+define void @invariant_load_cond(i32* noalias nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %cond, i64 %n) #0 { ; CHECK-LABEL: @invariant_load_cond ; CHECK: vector.body ; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42 @@ -117,6 +117,7 @@ for.end: ret void } +attributes #0 = { vscale_range(0, 16) } !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index 4fbad7ab4e247..a2760c79a838e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -1,6 +1,6 @@ ; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -scalable-vectorization=preferred -force-target-instruction-cost=1 -o - | FileCheck %s -define void @gather_nxv4i32_ind64(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) { +define void @gather_nxv4i32_ind64(float* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, float* noalias nocapture %c, i64 %n) #0 { ; CHECK-LABEL: @gather_nxv4i32_ind64 ; CHECK: vector.body: ; CHECK: %[[IND:.*]] = load , * @@ -29,7 +29,7 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo ; NOTE: I deliberately chose '%b' as an array of i32 indices, since the ; additional 'sext' in the for.body loop exposes additional code paths ; during vectorisation. 
-define void @scatter_nxv4i32_ind32(float* noalias nocapture %a, i32* noalias nocapture readonly %b, float* noalias nocapture readonly %c, i64 %n) { +define void @scatter_nxv4i32_ind32(float* noalias nocapture %a, i32* noalias nocapture readonly %b, float* noalias nocapture readonly %c, i64 %n) #0 { ; CHECK-LABEL: @scatter_nxv4i32_ind32 ; CHECK: vector.body: ; CHECK: %[[VALS:.*]] = load @@ -57,7 +57,7 @@ for.cond.cleanup: ; preds = %for.body, %entry ret void } -define void @scatter_inv_nxv4i32(i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) { +define void @scatter_inv_nxv4i32(i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 { ; CHECK-LABEL: @scatter_inv_nxv4i32 ; CHECK: vector.ph: ; CHECK: %[[INS:.*]] = insertelement poison, i32* %inv, i32 0 @@ -89,7 +89,7 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo ret void } -define void @gather_inv_nxv4i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %inv, i64 %n) { +define void @gather_inv_nxv4i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %inv, i64 %n) #0 { ; CHECK-LABEL: @gather_inv_nxv4i32 ; CHECK: vector.ph: ; CHECK: %[[INS:.*]] = insertelement poison, i32* %inv, i32 0 @@ -124,7 +124,7 @@ for.cond.cleanup: ; preds = %for.inc, %entry -define void @gather_nxv4i32_ind64_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) { +define void @gather_nxv4i32_ind64_stride2(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 { ; CHECK-LABEL: @gather_nxv4i32_ind64_stride2 ; CHECK: vector.body: ; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] @@ -153,6 +153,8 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo ret void } +attributes #0 = { vscale_range(0, 16) } + !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll index 0e02af631d205..b534171274047 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll @@ -59,7 +59,7 @@ for.end: ; preds = %for.inc, %entry ret void } -attributes #0 = { "target-features"="+neon,+sve" } +attributes #0 = { "target-features"="+neon,+sve" vscale_range(0, 16) } !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll index 8327e09063b68..23eb2d0b0aba0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll @@ -1,6 +1,6 @@ ; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -scalable-vectorization=on -dce -instcombine -S <%s | FileCheck %s -define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) { +define void @stride7_i32(i32* noalias nocapture %dst, i64 %n) #0 { ; CHECK-LABEL: @stride7_i32( ; CHECK: vector.body ; CHECK: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] @@ -27,7 +27,7 @@ for.end: ; preds = %for.end.loopexit, % ret void } -define void @stride7_f64(double* noalias nocapture %dst, i64 %n) { +define void @stride7_f64(double* noalias nocapture %dst, i64 %n) #0 { ; CHECK-LABEL: @stride7_f64( ; CHECK: vector.body ; CHECK: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] @@ -55,7 +55,7 @@ for.end: ; preds = %for.end.loopexit, % } -define void @cond_stride7_f64(double* noalias nocapture %dst, i64* noalias nocapture readonly %cond, i64 %n) { +define void @cond_stride7_f64(double* noalias nocapture %dst, i64* noalias nocapture readonly %cond, i64 %n) #0 { ; CHECK-LABEL: @cond_stride7_f64( ; CHECK: vector.body ; CHECK: 
%[[MASK:.*]] = icmp ne @@ -90,7 +90,7 @@ for.end: ; preds = %for.end.loopexit, % ret void } - +attributes #0 = { vscale_range(0, 16) } !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll index 1aef842b297fb..0221c890a6e1b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll @@ -49,7 +49,7 @@ for.end: ret double %add } -attributes #0 = { "target-features"="+sve" } +attributes #0 = { "target-features"="+sve" vscale_range(0, 16) } !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index afa2bd093c273..1881801ec2579 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -12,7 +12,7 @@ ; that we can use gather instructions with the correct offsets, taking ; vscale into account. -define void @widen_ptr_phi_unrolled(i32* noalias nocapture %a, i32* noalias nocapture %b, i32* nocapture readonly %c, i64 %n) { +define void @widen_ptr_phi_unrolled(i32* noalias nocapture %a, i32* noalias nocapture %b, i32* nocapture readonly %c, i64 %n) #0 { ; CHECK-LABEL: @widen_ptr_phi_unrolled( ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ %c, %vector.ph ], [ %[[PTR_IND:.*]], %vector.body ] @@ -122,7 +122,7 @@ for.cond.cleanup: ; preds = %for.body ; because it is stored to memory. 
; -define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) { +define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) #0 { ; CHECK-LABEL: @pointer_iv_mixed( ; CHECK: vector.body ; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ] @@ -170,7 +170,7 @@ for.end: ret i32 %tmp5 } - +attributes #0 = { vscale_range(0, 16) } !0 = distinct !{!0, !1, !2, !3, !4, !5} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.width", i32 4} From 3c4fad6a69887311f4e9feeca28126a89f314d45 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Tue, 17 Aug 2021 14:05:28 +0000 Subject: [PATCH 207/700] [LIT]Accept cat_64 command name on AIX in shtest AIX may use cat_64 for 64 bit cat, this is just update the lit test to accept the name as well. Reviewed By: #powerpc, shchenz Differential Revision: https://reviews.llvm.org/D108149 --- llvm/utils/lit/tests/shtest-format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py index a15c0f9501740..4d8004db267a6 100644 --- a/llvm/utils/lit/tests/shtest-format.py +++ b/llvm/utils/lit/tests/shtest-format.py @@ -20,7 +20,7 @@ # CHECK-NEXT: line 2: failed test output on stdout # CHECK: Command Output (stderr): # CHECK-NEXT: -- -# CHECK-NEXT: cat{{(\.exe)?}}: {{cannot open does-not-exist|does-not-exist: No such file or directory}} +# CHECK-NEXT: cat{{(_64)?(\.exe)?}}: {{cannot open does-not-exist|does-not-exist: No such file or directory}} # CHECK: -- # CHECK: FAIL: shtest-format :: external_shell/fail_with_bad_encoding.txt From a14920c0027e73fbe9cd971b914a46c0d754ec1a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 17 Aug 2021 07:10:51 -0700 Subject: [PATCH 208/700] [Bitcode] Remove unused declaration writeBitcodeHeader (NFC) The corresponding definition was removed on Nov 29, 2016 in commit 5a0a2e648c267d99111b21482ca709f580e9ccc2. 
--- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 0a202c376981a..f7c186f16a873 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -142,7 +142,6 @@ class BitcodeWriterBase { : Stream(Stream), StrtabBuilder(StrtabBuilder) {} protected: - void writeBitcodeHeader(); void writeModuleVersion(); }; From c67f497e7aeab28df5fd335b3c8cc89e6d52b762 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 17 Aug 2021 10:52:12 -0400 Subject: [PATCH 209/700] [libc++][NFC] Fix indentation of documentation --- libcxx/docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index d4205a1bcc275..b13e42f1dd2eb 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -128,8 +128,8 @@ for all the platforms and compilers that we claim to support. If a platform or c is not listed here, it is not officially supported. It may happen to work, and in practice the library is known to work on some platforms not listed here, but we don't make any guarantees. If you would like your compiler and/or platform -to be formally supported and listed here, -please work with the libc++ team to set up testing for your configuration. +to be formally supported and listed here, please work with the libc++ team to set +up testing for your configuration. C++ Dialect Support From 5c6f748cbc17d4ce82374f0c4c2364961152a1c4 Mon Sep 17 00:00:00 2001 From: Tozer Date: Tue, 17 Aug 2021 15:38:45 +0100 Subject: [PATCH 210/700] [MCParser] Correctly handle CRLF line ends when consuming line comments Fixes issue: https://bugs.llvm.org/show_bug.cgi?id=47983 The AsmLexer currently has an issue with lexing line comments in files with CRLF line endings, in which it reads the carriage return as being part of the line comment. 
This causes an error for certain valid comment layouts; this patch fixes this by excluding the carriage return from the line comment. Differential Revision: https://reviews.llvm.org/D90234 --- llvm/.gitattributes | 3 ++- llvm/lib/MC/MCParser/AsmLexer.cpp | 3 ++- llvm/test/tools/llvm-mca/directives-handle-crlf.s | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/directives-handle-crlf.s diff --git a/llvm/.gitattributes b/llvm/.gitattributes index 48ddf2f02d15b..b41ae6aec97b3 100644 --- a/llvm/.gitattributes +++ b/llvm/.gitattributes @@ -13,7 +13,8 @@ test/tools/llvm-strings/Inputs/numbers binary test/MC/AsmParser/incbin_abcd binary test/YAMLParser/spec-09-02.test binary -# This file must have CRLF line endings, therefore git should treat it as +# These files must have CRLF line endings, therefore git should treat them as # binary and not autoconvert line endings (for example, when core.autocrlf is # on). test/MC/AsmParser/preserve-comments-crlf.s binary +test/tools/llvm-mca/directives-handle-crlf.s binary diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index e328ba5315af5..bf9b9e916d6f6 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -228,6 +228,7 @@ AsmToken AsmLexer::LexLineComment() { int CurChar = getNextChar(); while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) CurChar = getNextChar(); + const char *NewlinePtr = CurPtr; if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n') ++CurPtr; @@ -235,7 +236,7 @@ AsmToken AsmLexer::LexLineComment() { if (CommentConsumer) { CommentConsumer->HandleComment( SMLoc::getFromPointer(CommentTextStart), - StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart)); + StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart)); } IsAtStartOfLine = true; diff --git a/llvm/test/tools/llvm-mca/directives-handle-crlf.s b/llvm/test/tools/llvm-mca/directives-handle-crlf.s new file mode 
100644 index 0000000000000..aa5c0fc205f50 --- /dev/null +++ b/llvm/test/tools/llvm-mca/directives-handle-crlf.s @@ -0,0 +1,4 @@ +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown %s +# LLVM-MCA-BEGIN foo +addl $42, %eax +# LLVM-MCA-END From d2b574a4dea5b718e4386bf2e26af0126e5978ce Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 17 Aug 2021 16:54:07 +0200 Subject: [PATCH 211/700] tsan: test: Initialize all fields of Params struct Some compilers started complaining about the test: tsan_trace_test.cpp:128:21: error: missing field 'type' initializer Fix it by initializing all 5 fields, even though the type field will be reset in the for loop. Differential Revision: https://reviews.llvm.org/D108207 --- compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp index 571fc0ab04b87..f9d9b645335b6 100644 --- a/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp +++ b/compiler-rt/lib/tsan/tests/unit/tsan_trace_test.cpp @@ -125,10 +125,10 @@ TEST(Trace, MemoryAccessSize) { } }; Thread::Params tests[] = { - {1, 0, 1, true}, {4, 0, 2, true}, - {4, 2, 2, true}, {8, 3, 1, true}, - {2, 1, 1, true}, {1, 1, 1, false}, - {8, 5, 4, false}, {4, static_cast(-1l), 4, false}, + {1, 0, 1, true, 0}, {4, 0, 2, true, 0}, + {4, 2, 2, true, 0}, {8, 3, 1, true, 0}, + {2, 1, 1, true, 0}, {1, 1, 1, false, 0}, + {8, 5, 4, false, 0}, {4, static_cast(-1l), 4, false, 0}, }; for (auto params : tests) { for (params.type = 0; params.type < 3; params.type++) From ad40cb8821666a255ad9982b99ea0ce13a57dfaa Mon Sep 17 00:00:00 2001 From: Tozer Date: Tue, 17 Aug 2021 16:14:12 +0100 Subject: [PATCH 212/700] Fix: [MCParser] Correctly handle CRLF line ends when consuming line comments Fixes an issue with revision 5c6f748c. 
Move the test added in the above commit into the X86 folder, ensuring that it is only run on targets where its triple is valid. --- llvm/.gitattributes | 2 +- .../tools/llvm-mca/{ => X86}/directives-handle-crlf.s | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename llvm/test/tools/llvm-mca/{ => X86}/directives-handle-crlf.s (96%) diff --git a/llvm/.gitattributes b/llvm/.gitattributes index b41ae6aec97b3..8812053169ff6 100644 --- a/llvm/.gitattributes +++ b/llvm/.gitattributes @@ -17,4 +17,4 @@ test/YAMLParser/spec-09-02.test binary # binary and not autoconvert line endings (for example, when core.autocrlf is # on). test/MC/AsmParser/preserve-comments-crlf.s binary -test/tools/llvm-mca/directives-handle-crlf.s binary +test/tools/llvm-mca/X86/directives-handle-crlf.s binary diff --git a/llvm/test/tools/llvm-mca/directives-handle-crlf.s b/llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s similarity index 96% rename from llvm/test/tools/llvm-mca/directives-handle-crlf.s rename to llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s index aa5c0fc205f50..e62cdd269bc8f 100644 --- a/llvm/test/tools/llvm-mca/directives-handle-crlf.s +++ b/llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s @@ -1,4 +1,4 @@ -# RUN: llvm-mca -mtriple=x86_64-unknown-unknown %s -# LLVM-MCA-BEGIN foo -addl $42, %eax -# LLVM-MCA-END +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown %s +# LLVM-MCA-BEGIN foo +addl $42, %eax +# LLVM-MCA-END From ceff0b7258aefc6e5299802c119b3d01545440f4 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 11 Aug 2021 14:26:33 -0400 Subject: [PATCH 213/700] [libc++] Do not require movability in __non_propagating_cache::__emplace_deref As explained in http://eel.is/c++draft/range.nonprop.cache#note-1, we should allow copy and move elision to happen when calling emplace_deref in non-propagating-cache. 
Before this change, the only way to emplace into the non-propagating-cache was to call `__set(*it)`, which materialized `*it` when binding it to the reference argument of `__set` and disabled move elision. As a fly-by change, this also renames `__set` to `__emplace` for consistency and adds tests for it. Differential Revision: https://reviews.llvm.org/D107932 --- libcxx/include/__ranges/drop_view.h | 2 +- libcxx/include/__ranges/join_view.h | 2 +- .../include/__ranges/non_propagating_cache.h | 43 ++++---- libcxx/include/__ranges/reverse_view.h | 2 +- .../range.nonprop.cache/assign.copy.pass.cpp | 8 +- .../range.nonprop.cache/assign.move.pass.cpp | 8 +- .../range.nonprop.cache/ctor.copy.pass.cpp | 2 +- .../range.nonprop.cache/ctor.move.pass.cpp | 4 +- .../ranges/range.nonprop.cache/deref.pass.cpp | 4 +- .../range.nonprop.cache/emplace.pass.cpp | 97 +++++++++++++++++++ .../range.nonprop.cache/emplace_from.pass.cpp | 79 +++++++++++++++ .../range.nonprop.cache/has_value.pass.cpp | 2 +- 12 files changed, 218 insertions(+), 35 deletions(-) create mode 100644 libcxx/test/libcxx/ranges/range.nonprop.cache/emplace.pass.cpp create mode 100644 libcxx/test/libcxx/ranges/range.nonprop.cache/emplace_from.pass.cpp diff --git a/libcxx/include/__ranges/drop_view.h b/libcxx/include/__ranges/drop_view.h index 6d1a5a2efff66..d732d02f2f2a0 100644 --- a/libcxx/include/__ranges/drop_view.h +++ b/libcxx/include/__ranges/drop_view.h @@ -77,7 +77,7 @@ namespace ranges { auto __tmp = ranges::next(ranges::begin(__base_), __count_, ranges::end(__base_)); if constexpr (_UseCache) - __cached_begin_.__set(__tmp); + __cached_begin_.__emplace(__tmp); return __tmp; } diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h index 44aa1d0264e6d..9aa69da76cf0b 100644 --- a/libcxx/include/__ranges/join_view.h +++ b/libcxx/include/__ranges/join_view.h @@ -191,7 +191,7 @@ namespace ranges { if constexpr (__ref_is_glvalue) return *__outer_; else - return 
__parent_->__cache_.__emplace_deref(__outer_); + return __parent_->__cache_.__emplace_from([&]() -> decltype(auto) { return *__outer_; }); }(); __inner_ = ranges::begin(__inner); if (*__inner_ != ranges::end(__inner)) diff --git a/libcxx/include/__ranges/non_propagating_cache.h b/libcxx/include/__ranges/non_propagating_cache.h index 76577f47a5ad4..456e08d8c971a 100644 --- a/libcxx/include/__ranges/non_propagating_cache.h +++ b/libcxx/include/__ranges/non_propagating_cache.h @@ -13,6 +13,7 @@ #include <__iterator/concepts.h> // indirectly_readable #include <__iterator/iterator_traits.h> // iter_reference_t #include <__memory/addressof.h> +#include <__utility/forward.h> #include // constructible_from #include #include @@ -21,13 +22,8 @@ #pragma GCC system_header #endif -_LIBCPP_PUSH_MACROS -#include <__undef_macros> - _LIBCPP_BEGIN_NAMESPACE_STD -// clang-format off - #if !defined(_LIBCPP_HAS_NO_RANGES) namespace ranges { @@ -42,7 +38,20 @@ namespace ranges { template requires is_object_v<_Tp> class _LIBCPP_TEMPLATE_VIS __non_propagating_cache { - optional<_Tp> __value_ = nullopt; + struct __from_tag { }; + struct __forward_tag { }; + + // This helper class is needed to perform copy and move elision when + // constructing the contained type from an iterator. + struct __wrapper { + template + constexpr explicit __wrapper(__forward_tag, _Args&& ...__args) : __t_(_VSTD::forward<_Args>(__args)...) 
{ } + template + constexpr explicit __wrapper(__from_tag, _Fn const& __f) : __t_(__f()) { } + _Tp __t_; + }; + + optional<__wrapper> __value_ = nullopt; public: _LIBCPP_HIDE_FROM_ABI __non_propagating_cache() = default; @@ -75,23 +84,23 @@ namespace ranges { } _LIBCPP_HIDE_FROM_ABI - constexpr _Tp& operator*() { return *__value_; } + constexpr _Tp& operator*() { return __value_->__t_; } _LIBCPP_HIDE_FROM_ABI - constexpr _Tp const& operator*() const { return *__value_; } + constexpr _Tp const& operator*() const { return __value_->__t_; } _LIBCPP_HIDE_FROM_ABI constexpr bool __has_value() const { return __value_.has_value(); } + + template _LIBCPP_HIDE_FROM_ABI - constexpr void __set(_Tp const& __value) { __value_.emplace(__value); } - _LIBCPP_HIDE_FROM_ABI - constexpr void __set(_Tp&& __value) { __value_.emplace(_VSTD::move(__value)); } + constexpr _Tp& __emplace_from(_Fn const& __f) { + return __value_.emplace(__from_tag{}, __f).__t_; + } - template + template _LIBCPP_HIDE_FROM_ABI - constexpr _Tp& __emplace_deref(const _Other& __value) { - __value_.reset(); - __value_.emplace(*__value); - return *__value_; + constexpr _Tp& __emplace(_Args&& ...__args) { + return __value_.emplace(__forward_tag{}, _VSTD::forward<_Args>(__args)...).__t_; } }; @@ -102,6 +111,4 @@ namespace ranges { _LIBCPP_END_NAMESPACE_STD -_LIBCPP_POP_MACROS - #endif // _LIBCPP___RANGES_NON_PROPAGATING_CACHE_H diff --git a/libcxx/include/__ranges/reverse_view.h b/libcxx/include/__ranges/reverse_view.h index 5953f74fd77d5..ad88dc7138053 100644 --- a/libcxx/include/__ranges/reverse_view.h +++ b/libcxx/include/__ranges/reverse_view.h @@ -64,7 +64,7 @@ namespace ranges { auto __tmp = _VSTD::make_reverse_iterator(ranges::next(ranges::begin(__base_), ranges::end(__base_))); if constexpr (_UseCache) - __cached_begin_.__set(__tmp); + __cached_begin_.__emplace(__tmp); return __tmp; } diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.copy.pass.cpp 
b/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.copy.pass.cpp index 393cde1199b21..108422fa4e228 100644 --- a/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.copy.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.copy.pass.cpp @@ -47,7 +47,7 @@ constexpr void test() { // Assign to an empty cache { - Cache a; a.__set(T{3}); + Cache a; a.__emplace(3); Cache b; Cache& result = (b = a); @@ -60,8 +60,8 @@ constexpr void test() { // Assign to a non-empty cache { - Cache a; a.__set(T{3}); - Cache b; b.__set(T{5}); + Cache a; a.__emplace(3); + Cache b; b.__emplace(5); Cache& result = (b = a); assert(&result == &b); @@ -81,7 +81,7 @@ constexpr void test() { // Self-assignment should not do anything (case with non-empty cache) { - Cache b; b.__set(T{5}); + Cache b; b.__emplace(5); Cache& result = (b = b); assert(&result == &b); assert(b.__has_value()); diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.move.pass.cpp b/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.move.pass.cpp index 6c5ccb100293a..5f04619832fbc 100644 --- a/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.move.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/assign.move.pass.cpp @@ -46,7 +46,7 @@ constexpr void test() { // Assign to an empty cache { - Cache a; a.__set(T{3}); + Cache a; a.__emplace(3); Cache b; Cache& result = (b = std::move(a)); @@ -57,8 +57,8 @@ constexpr void test() { // Assign to a non-empty cache { - Cache a; a.__set(T{3}); - Cache b; b.__set(T{5}); + Cache a; a.__emplace(3); + Cache b; b.__emplace(5); Cache& result = (b = std::move(a)); assert(&result == &b); @@ -77,7 +77,7 @@ constexpr void test() { // Self-assignment should clear the cache (case with non-empty cache) { - Cache b; b.__set(T{5}); + Cache b; b.__emplace(5); Cache& result = (b = std::move(b)); assert(&result == &b); diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.copy.pass.cpp 
b/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.copy.pass.cpp index fae7a3e92939a..762222058e1a9 100644 --- a/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.copy.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.copy.pass.cpp @@ -39,7 +39,7 @@ constexpr void test() { using Cache = std::ranges::__non_propagating_cache; static_assert(std::is_nothrow_copy_constructible_v); Cache a; - a.__set(T{3}); + a.__emplace(3); // Test with direct initialization { diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.move.pass.cpp b/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.move.pass.cpp index 1fa454f7acc17..b28c751281d67 100644 --- a/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.move.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/ctor.move.pass.cpp @@ -34,7 +34,7 @@ constexpr void test() { // Test with direct initialization { Cache a; - a.__set(T{3}); + a.__emplace(3); Cache b(std::move(a)); assert(!b.__has_value()); // make sure we don't propagate @@ -44,7 +44,7 @@ constexpr void test() { // Test with copy initialization { Cache a; - a.__set(T{3}); + a.__emplace(3); Cache b = std::move(a); assert(!b.__has_value()); // make sure we don't propagate diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/deref.pass.cpp b/libcxx/test/libcxx/ranges/range.nonprop.cache/deref.pass.cpp index 55b288028af3d..51508c59a22aa 100644 --- a/libcxx/test/libcxx/ranges/range.nonprop.cache/deref.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/deref.pass.cpp @@ -23,14 +23,14 @@ constexpr void test() { // non-const version { - Cache cache; cache.__set(T{3}); + Cache cache; cache.__emplace(3); T& result = *cache; assert(result == T{3}); } // const version { - Cache cache; cache.__set(T{3}); + Cache cache; cache.__emplace(3); T const& result = *static_cast(cache); assert(result == T{3}); } diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/emplace.pass.cpp 
b/libcxx/test/libcxx/ranges/range.nonprop.cache/emplace.pass.cpp new file mode 100644 index 0000000000000..636eda8aa6d91 --- /dev/null +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/emplace.pass.cpp @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// constexpr T& __emplace(Args&& ...); + +#include + +#include +#include + +template +struct X { + int value = -1; + template + friend constexpr bool operator==(X const& a, X const& b) { return I == J && a.value == b.value; } +}; + +struct NonMovable { + int value = -1; + NonMovable() = default; + constexpr explicit NonMovable(int v) : value(v) { } + NonMovable(NonMovable&&) = delete; + NonMovable& operator=(NonMovable&&) = delete; +}; + +constexpr bool test() { + { + using T = std::tuple<>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace(); + assert(&result == &*cache); + assert(result == T()); + } + + { + using T = std::tuple>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace(); + assert(&result == &*cache); + assert(result == T()); + } + { + using T = std::tuple>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace(X<0>{0}); + assert(&result == &*cache); + assert(result == T(X<0>{0})); + } + + { + using T = std::tuple, X<1>>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace(); + assert(&result == &*cache); + assert(result == T()); + } + { 
+ using T = std::tuple, X<1>>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace(X<0>{0}, X<1>{1}); + assert(&result == &*cache); + assert(result == T(X<0>{0}, X<1>{1})); + } + + // Make sure that we do not require the type to be movable when we emplace it. + // Move elision should be performed instead, see http://eel.is/c++draft/range.nonprop.cache#note-1. + { + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + NonMovable& result = cache.__emplace(); + assert(&result == &*cache); + assert(result.value == -1); + } + + return true; +} + +int main(int, char**) { + static_assert(test()); + test(); + return 0; +} diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/emplace_from.pass.cpp b/libcxx/test/libcxx/ranges/range.nonprop.cache/emplace_from.pass.cpp new file mode 100644 index 0000000000000..d87db6570cdb9 --- /dev/null +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/emplace_from.pass.cpp @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-no-concepts +// UNSUPPORTED: libcpp-has-no-incomplete-ranges + +// template +// constexpr T& __emplace_from(Fn const&); + +#include + +#include +#include + +template +struct X { + int value = -1; + template + friend constexpr bool operator==(X const& a, X const& b) { return I == J && a.value == b.value; } +}; + +struct NonMovable { + int value = -1; + NonMovable() = default; + constexpr explicit NonMovable(int v) : value(v) { } + NonMovable(NonMovable&&) = delete; + NonMovable& operator=(NonMovable&&) = delete; +}; + +constexpr bool test() { + { + using T = std::tuple<>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace_from([] { return T(); }); + assert(&result == &*cache); + assert(result == T()); + } + { + using T = std::tuple>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace_from([] { return T(X<0>{0}); }); + assert(&result == &*cache); + assert(result == T(X<0>{0})); + } + { + using T = std::tuple, X<1>>; + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + T& result = cache.__emplace_from([] { return T(X<0>{0}, X<1>{1}); }); + assert(&result == &*cache); + assert(result == T(X<0>{0}, X<1>{1})); + } + + // Make sure that we do not require the type to be movable when we emplace it. + // Move elision should be performed instead, see http://eel.is/c++draft/range.nonprop.cache#note-1. 
+ { + using Cache = std::ranges::__non_propagating_cache; + Cache cache; + NonMovable& result = cache.__emplace_from([] { return NonMovable(3); }); + assert(&result == &*cache); + assert(result.value == 3); + } + + return true; +} + +int main(int, char**) { + static_assert(test()); + test(); + return 0; +} diff --git a/libcxx/test/libcxx/ranges/range.nonprop.cache/has_value.pass.cpp b/libcxx/test/libcxx/ranges/range.nonprop.cache/has_value.pass.cpp index 4268cf9abe903..2b30e99ce06e7 100644 --- a/libcxx/test/libcxx/ranges/range.nonprop.cache/has_value.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.nonprop.cache/has_value.pass.cpp @@ -28,7 +28,7 @@ constexpr void test() { // __has_value on a non-empty cache { - Cache cache; cache.__set(T{}); + Cache cache; cache.__emplace(); assert(cache.__has_value()); } } From 5009be2f09ae25654753ee533dbfbc238aaf591c Mon Sep 17 00:00:00 2001 From: Kostya Kortchinsky Date: Mon, 16 Aug 2021 15:23:48 -0700 Subject: [PATCH 214/700] [scudo] Fix format string specifiers Enable `-Wformat` again, and fix the offending instances. Differential Revision: https://reviews.llvm.org/D108168 --- compiler-rt/lib/scudo/standalone/CMakeLists.txt | 3 --- compiler-rt/lib/scudo/standalone/primary64.h | 4 ++-- compiler-rt/lib/scudo/standalone/secondary.h | 11 +++++------ compiler-rt/lib/scudo/standalone/size_class_map.h | 4 ++-- compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt | 3 --- compiler-rt/lib/scudo/standalone/wrappers_c.inc | 2 +- 6 files changed, 10 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt index 17af724541071..8c3708d3a45f5 100644 --- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt @@ -10,9 +10,6 @@ list(APPEND SCUDO_CFLAGS -g -nostdinc++) -# Too many existing bugs, needs cleanup. 
-append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format SCUDO_CFLAGS) - # Remove -stdlib= which is unused when passing -nostdinc++. string(REGEX REPLACE "-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 13420bf3d2225..6c1785512c658 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -164,9 +164,9 @@ template class SizeClassAllocator64 { PoppedBlocks += Region->Stats.PoppedBlocks; PushedBlocks += Region->Stats.PushedBlocks; } - Str->append("Stats: SizeClassAllocator64: %zuM mapped (%zuM rss) in %zu " + Str->append("Stats: SizeClassAllocator64: %zuM mapped (%uM rss) in %zu " "allocations; remains %zu\n", - TotalMapped >> 20, 0, PoppedBlocks, + TotalMapped >> 20, 0U, PoppedBlocks, PoppedBlocks - PushedBlocks); for (uptr I = 0; I < NumClasses; I++) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index aa50fa98b1138..abb58a2882aff 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -602,12 +602,11 @@ void MapAllocator::deallocate(Options Options, void *Ptr) { template void MapAllocator::getStats(ScopedString *Str) const { - Str->append( - "Stats: MapAllocator: allocated %zu times (%zuK), freed %zu times " - "(%zuK), remains %zu (%zuK) max %zuM\n", - NumberOfAllocs, AllocatedBytes >> 10, NumberOfFrees, FreedBytes >> 10, - NumberOfAllocs - NumberOfFrees, (AllocatedBytes - FreedBytes) >> 10, - LargestSize >> 20); + Str->append("Stats: MapAllocator: allocated %u times (%zuK), freed %u times " + "(%zuK), remains %u (%zuK) max %zuM\n", + NumberOfAllocs, AllocatedBytes >> 10, NumberOfFrees, + FreedBytes >> 10, NumberOfAllocs - NumberOfFrees, + (AllocatedBytes - FreedBytes) >> 10, LargestSize >> 20); } } // namespace scudo diff --git a/compiler-rt/lib/scudo/standalone/size_class_map.h 
b/compiler-rt/lib/scudo/standalone/size_class_map.h index ba0f78453bcb8..28b16d976e5ea 100644 --- a/compiler-rt/lib/scudo/standalone/size_class_map.h +++ b/compiler-rt/lib/scudo/standalone/size_class_map.h @@ -335,8 +335,8 @@ template inline void printMap() { const uptr L = S ? getMostSignificantSetBitIndex(S) : 0; const uptr Cached = SCMap::getMaxCachedHint(S) * S; Buffer.append( - "C%02zu => S: %zu diff: +%zu %02zu%% L %zu Cached: %zu %zu; id %zu\n", - I, S, D, P, L, SCMap::getMaxCachedHint(S), Cached, + "C%02zu => S: %zu diff: +%zu %02zu%% L %zu Cached: %u %zu; id %zu\n", I, + S, D, P, L, SCMap::getMaxCachedHint(S), Cached, SCMap::getClassIdBySize(S)); TotalCached += Cached; PrevS = S; diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt index def0a20a4348f..f4186eba16881 100644 --- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt @@ -17,9 +17,6 @@ set(SCUDO_UNITTEST_CFLAGS # TODO(kostyak): find a way to make -fsized-deallocation work -Wno-mismatched-new-delete) -# Too many existing bugs, needs cleanup. 
-append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format SCUDO_UNITTEST_CFLAGS) - if(COMPILER_RT_DEBUG) list(APPEND SCUDO_UNITTEST_CFLAGS -DSCUDO_DEBUG=1) endif() diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c.inc b/compiler-rt/lib/scudo/standalone/wrappers_c.inc index 43efb02cb8603..6c6bcb6783a7e 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c.inc +++ b/compiler-rt/lib/scudo/standalone/wrappers_c.inc @@ -226,7 +226,7 @@ INTERFACE WEAK int SCUDO_PREFIX(malloc_info)(UNUSED int options, FILE *stream) { fputs("\n", stream); for (scudo::uptr i = 0; i != max_size; ++i) if (sizes[i]) - fprintf(stream, "\n", i, sizes[i]); + fprintf(stream, "\n", i, sizes[i]); fputs("\n", stream); SCUDO_PREFIX(free)(sizes); return 0; From 0f1e67fac24b7cb49c3e4feeeadf536d27d1473d Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Tue, 17 Aug 2021 10:34:59 -0500 Subject: [PATCH 215/700] [Polly] Fix possibly infinite loop. The loop had no side-effect since first committed in 642594ae87aca. While it is obvious what was intended, the code seems to never trigger. --- polly/lib/Analysis/ScopBuilder.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 4351d27f193ce..17907749ad71b 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -706,9 +706,11 @@ isl::set ScopBuilder::getPredecessorDomainConstraints(BasicBlock *BB, // Check if there is a valid region we can use for propagation, thus look // for a region that contains the predecessor and has @p BB as exit block. + // FIXME: This was an side-effect-free (and possibly infinite) loop when + // committed and seems not to be needed. 
auto *PredR = RI.getRegionFor(PredBB); while (PredR->getExit() != BB && !PredR->contains(BB)) - PredR->getParent(); + PredR = PredR->getParent(); // If a valid region for propagation was found use the entry of that region // for propagation, otherwise the PredBB directly. From 2078c4ecfda80f802febc4f98e4a163656093c43 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Tue, 17 Aug 2021 18:42:22 +0300 Subject: [PATCH 216/700] [X86] Lower insertions into upper half of an 256-bit vector as broadcast+blend (PR50971) Broadcast is not worse than extract+insert of subvector. https://godbolt.org/z/aPq98G6Yh Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D105390 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 26 +- llvm/test/CodeGen/X86/avx-insertelt.ll | 170 ++++++---- llvm/test/CodeGen/X86/avx2-masked-gather.ll | 61 ++-- .../test/CodeGen/X86/avx512-insert-extract.ll | 198 +++++++---- .../CodeGen/X86/avx512-masked-memop-64-32.ll | 19 +- .../test/CodeGen/X86/insertelement-shuffle.ll | 13 +- llvm/test/CodeGen/X86/masked_expandload.ll | 308 +++++++----------- llvm/test/CodeGen/X86/masked_gather.ll | 124 +++---- .../test/CodeGen/X86/masked_gather_scatter.ll | 98 +++--- llvm/test/CodeGen/X86/masked_load.ll | 245 +++++++------- 10 files changed, 640 insertions(+), 622 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 319c4eeb4ed9a..25f27a056d466 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19190,12 +19190,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } } + unsigned NumEltsIn128 = 128 / EltSizeInBits; + assert(isPowerOf2_32(NumEltsIn128) && + "Vectors will always have power-of-two number of elements."); + + // If we are not inserting into the low 128-bit vector chunk, + // then prefer the broadcast+blend sequence. + // FIXME: relax the profitability check iff all N1 uses are insertions. 
+ if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 && + ((Subtarget.hasAVX2() && EltSizeInBits != 8) || + (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) { + SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1); + SmallVector BlendMask; + for (unsigned i = 0; i != NumElts; ++i) + BlendMask.push_back(i == IdxVal ? i + NumElts : i); + return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask); + } + // Get the desired 128-bit vector chunk. SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. - unsigned NumEltsIn128 = 128 / EltSizeInBits; - assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); @@ -37977,6 +37992,13 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + // broadcast(extract_vector_elt(x, 0)) -> broadcast(x). + if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isNullConstant(Src.getOperand(1)) && + DAG.getTargetLoweringInfo().isTypeLegal( + Src.getOperand(0).getValueType())) + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + // Share broadcast with the longest vector and extract low subvector (free). // Ensure the same SDValue from the SDNode use is being used. for (SDNode *User : Src->uses()) diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll index 3f5d004841e81..1bca2df5d9ce9 100644 --- a/llvm/test/CodeGen/X86/avx-insertelt.ll +++ b/llvm/test/CodeGen/X86/avx-insertelt.ll @@ -91,23 +91,35 @@ define <4 x i64> @insert_i64_firstelt_of_low_subvector(<4 x i64> %x, i64 %s) { ; 0'th element of high subvector insertion into an AVX register. 
define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_firstelt_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f32_firstelt_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32_firstelt_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 4 ret <8 x float> %i0 } define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, double %s) { -; ALL-LABEL: insert_f64_firstelt_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f64_firstelt_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64_firstelt_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %x, double %s, i32 2 ret <4 x double> %i0 } @@ -140,9 +152,10 @@ define <16 x i16> @insert_i16_firstelt_of_high_subvector(<16 x i16> %x, i16 %s) ; ; AVX2-LABEL: insert_i16_firstelt_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrw $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 8 ret <16 x i16> %i0 @@ -158,9 +171,9 @@ define <8 x i32> @insert_i32_firstelt_of_high_subvector(<8 x i32> %x, i32 %s) { ; ; AVX2-LABEL: insert_i32_firstelt_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrd $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 4 ret <8 x i32> %i0 @@ -176,9 +189,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) { ; ; AVX2-LABEL: insert_i64_firstelt_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 2 ret <4 x i64> %i0 @@ -187,26 +200,38 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) { ; element insertion into 0'th element of both subvectors define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_firstelts: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3] -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f32_firstelts: +; AVX: # %bb.0: +; AVX-NEXT: vblendps 
{{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32_firstelts: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 0 %i1 = insertelement <8 x float> %i0, float %s, i32 4 ret <8 x float> %i1 } define <4 x double> @insert_f64_firstelts(<4 x double> %x, double %s) { -; ALL-LABEL: insert_f64_firstelts: -; ALL: # %bb.0: -; ALL-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f64_firstelts: +; AVX: # %bb.0: +; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64_firstelts: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %x, double %s, i32 0 %i1 = insertelement <4 x double> %i0, double %s, i32 2 ret <4 x double> %i1 @@ -245,9 +270,11 @@ define <16 x i16> @insert_i16_firstelts(<16 x i16> %x, i16 %s) { ; AVX2-LABEL: insert_i16_firstelts: ; AVX2: # %bb.0: ; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 0 %i1 = insertelement <16 x i16> %i0, i16 %s, i32 8 @@ -266,10 +293,8 @@ define <8 x i32> @insert_i32_firstelts(<8 x i32> %x, i32 %s) { ; AVX2-LABEL: insert_i32_firstelts: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 0 %i1 = insertelement <8 x i32> %i0, i32 %s, i32 4 @@ -288,9 +313,10 @@ define <4 x i64> @insert_i64_firstelts(<4 x i64> %x, i64 %s) { ; AVX2-LABEL: insert_i64_firstelts: ; AVX2: # %bb.0: ; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 0 %i1 = insertelement <4 x i64> %i0, i64 %s, i32 2 @@ -300,23 +326,35 @@ define <4 x i64> @insert_i64_firstelts(<4 x i64> %x, i64 %s) { ; element insertion into two elements of high subvector define <8 x float> @insert_f32_two_elts_of_high_subvector(<8 x float> %x, float %s) { -; ALL-LABEL: insert_f32_two_elts_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2 -; ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,3] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: 
insert_f32_two_elts_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f32_two_elts_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq %i0 = insertelement <8 x float> %x, float %s, i32 4 %i1 = insertelement <8 x float> %i0, float %s, i32 5 ret <8 x float> %i1 } define <4 x double> @insert_f64_two_elts_of_high_subvector(<4 x double> %x, double %s) { -; ALL-LABEL: insert_f64_two_elts_of_high_subvector: -; ALL: # %bb.0: -; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq +; AVX-LABEL: insert_f64_two_elts_of_high_subvector: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: insert_f64_two_elts_of_high_subvector: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq %i0 = insertelement <4 x double> %x, double %s, i32 2 %i1 = insertelement <4 x double> %i0, double %s, i32 3 ret <4 x double> %i1 @@ -354,10 +392,9 @@ define <16 x i16> @insert_i16_two_elts_of_high_subvector(<16 x i16> %x, i16 %s) ; ; AVX2-LABEL: insert_i16_two_elts_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrw $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <16 x i16> %x, i16 %s, i32 8 %i1 = insertelement <16 x i16> %i0, i16 %s, i32 9 @@ -375,10 +412,9 @@ define <8 x 
i32> @insert_i32_two_elts_of_high_subvector(<8 x i32> %x, i32 %s) { ; ; AVX2-LABEL: insert_i32_two_elts_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrd $0, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-NEXT: retq %i0 = insertelement <8 x i32> %x, i32 %s, i32 4 %i1 = insertelement <8 x i32> %i0, i32 %s, i32 5 @@ -395,9 +431,9 @@ define <4 x i64> @insert_i64_two_elts_of_high_subvector(<4 x i64> %x, i64 %s) { ; ; AVX2-LABEL: insert_i64_two_elts_of_high_subvector: ; AVX2: # %bb.0: -; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: retq %i0 = insertelement <4 x i64> %x, i64 %s, i32 2 %i1 = insertelement <4 x i64> %i0, i64 %s, i32 3 diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll index 9b3635fa1c9e8..0eaa034bf32b2 100644 --- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll +++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll @@ -396,17 +396,15 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3 ; NOGATHER-NEXT: je .LBB6_10 ; NOGATHER-NEXT: # %bb.9: # %cond.load10 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; NOGATHER-NEXT: .LBB6_10: # %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB6_12 ; NOGATHER-NEXT: # 
%bb.11: # %cond.load13 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; NOGATHER-NEXT: .LBB6_12: # %else14 ; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al @@ -419,16 +417,14 @@ define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i3 ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB6_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrd $2, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; NOGATHER-NEXT: testb $-128, %al ; NOGATHER-NEXT: je .LBB6_16 ; NOGATHER-NEXT: .LBB6_15: # %cond.load19 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -503,18 +499,15 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, < ; NOGATHER-NEXT: je .LBB7_10 ; NOGATHER-NEXT: # %bb.9: # %cond.load10 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; NOGATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; NOGATHER-NEXT: .LBB7_10: 
# %else11 ; NOGATHER-NEXT: testb $32, %al ; NOGATHER-NEXT: je .LBB7_12 ; NOGATHER-NEXT: # %bb.11: # %cond.load13 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; NOGATHER-NEXT: .LBB7_12: # %else14 ; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NOGATHER-NEXT: testb $64, %al @@ -527,16 +520,14 @@ define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, < ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB7_13: # %cond.load16 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; NOGATHER-NEXT: testb $-128, %al ; NOGATHER-NEXT: je .LBB7_16 ; NOGATHER-NEXT: .LBB7_15: # %cond.load19 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastss (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -597,16 +588,14 @@ define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i6 ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB8_5: # %cond.load4 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB8_8 ; NOGATHER-NEXT: .LBB8_7: # %cond.load7 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: @@ -667,16 +656,14 @@ define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks ; NOGATHER-NEXT: retq ; NOGATHER-NEXT: .LBB9_5: # %cond.load4 ; NOGATHER-NEXT: vmovq %xmm0, %rcx -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm2 -; NOGATHER-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; NOGATHER-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rcx), %ymm2 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; NOGATHER-NEXT: testb $8, %al ; NOGATHER-NEXT: je .LBB9_8 ; NOGATHER-NEXT: .LBB9_7: # %cond.load7 ; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax -; NOGATHER-NEXT: vextractf128 $1, %ymm1, %xmm0 -; NOGATHER-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; NOGATHER-NEXT: vbroadcastsd (%rax), %ymm0 +; NOGATHER-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; NOGATHER-NEXT: vmovaps %ymm1, %ymm0 ; NOGATHER-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 7274d8335129c..4ae0d273daaee 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -8,9 +8,9 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { ; CHECK: ## %bb.0: ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] ; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vextractf32x4 $3, 
%zmm0, %xmm0 -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 +; CHECK-NEXT: vbroadcastss %xmm1, %zmm1 +; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,15] +; CHECK-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %rrr = load float, float* %br %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 @@ -19,14 +19,23 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { } define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { -; CHECK-LABEL: test2: -; CHECK: ## %bb.0: -; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] -; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 -; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: test2: +; KNL: ## %bb.0: +; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 +; KNL-NEXT: movb $64, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: test2: +; SKX: ## %bb.0: +; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0 +; SKX-NEXT: movb $64, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1} +; SKX-NEXT: retq %rrr = load double, double* %br %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6 @@ -535,14 +544,23 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) { } define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v8i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, 
%xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: movb $8, %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; SKX-NEXT: movb $8, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <8 x i64> %x, i64 %val, i32 1 %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3 @@ -550,13 +568,22 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { } define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { -; CHECK-LABEL: insert_v4i64: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v4i64: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vmovq %rdi, %xmm1 +; KNL-NEXT: vpbroadcastq %xmm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v4i64: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SKX-NEXT: vpbroadcastq %rdi, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; SKX-NEXT: retq %val = load i64, i64* %ptr %r1 = insertelement <4 x i64> %x, i64 %val, i32 1 %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3 @@ -576,14 +603,23 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) { } define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) { -; 
CHECK-LABEL: insert_v16i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; KNL-NEXT: movw $32, %ax +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: vpbroadcastd %edi, %zmm0 {%k1} +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; SKX-NEXT: movw $32, %ax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vpbroadcastd %edi, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i32, i32* %ptr %r1 = insertelement <16 x i32> %x, i32 %val, i32 1 %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5 @@ -591,13 +627,22 @@ define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) { } define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { -; CHECK-LABEL: insert_v8i32: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v8i32: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpbroadcastd %xmm1, %ymm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v8i32: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SKX-NEXT: vpbroadcastd %edi, %ymm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; SKX-NEXT: retq %val = 
load i32, i32* %ptr %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5 @@ -617,14 +662,24 @@ define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) { } define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { -; CHECK-LABEL: insert_v32i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v32i16: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 +; KNL-NEXT: vmovd %edi, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; KNL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v32i16: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; SKX-NEXT: movl $512, %eax ## imm = 0x200 +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vpbroadcastw %edi, %zmm0 {%k1} +; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <32 x i16> %x, i16 %val, i32 1 %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9 @@ -632,13 +687,24 @@ define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { } define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { -; CHECK-LABEL: insert_v16i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: insert_v16i16: +; KNL: ## %bb.0: +; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpbroadcastw 
%xmm1, %ymm1 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: insert_v16i16: +; SKX: ## %bb.0: +; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SKX-NEXT: vpbroadcastw %edi, %ymm1 +; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; SKX-NEXT: retq %val = load i16, i16* %ptr %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9 @@ -739,12 +805,20 @@ define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) { } define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) { -; CHECK-LABEL: test_insert_128_v16i16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: retq +; KNL-LABEL: test_insert_128_v16i16: +; KNL: ## %bb.0: +; KNL-NEXT: vmovd %edi, %xmm1 +; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: retq +; +; SKX-LABEL: test_insert_128_v16i16: +; SKX: ## %bb.0: +; SKX-NEXT: vpbroadcastw %edi, %ymm1 +; SKX-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; SKX-NEXT: retq %r = insertelement <16 x i16> %x, i16 %y, i32 10 ret <16 x i16> %r } diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll index 35374c880b721..62c18b8b26380 100644 --- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll +++ 
b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll @@ -72,12 +72,19 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { ; -; AVX512-LABEL: load_one_mask_bit_set5: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_one_mask_bit_set5: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: movb $-128, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; SKX-LABEL: load_one_mask_bit_set5: +; SKX: ## %bb.0: +; SKX-NEXT: movb $-128, %al +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1} +; SKX-NEXT: retq %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>, <8 x double> %val) ret <8 x double> %res } diff --git a/llvm/test/CodeGen/X86/insertelement-shuffle.ll b/llvm/test/CodeGen/X86/insertelement-shuffle.ll index 000466f598bb0..57ab9344c4fd8 100644 --- a/llvm/test/CodeGen/X86/insertelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/insertelement-shuffle.ll @@ -30,19 +30,18 @@ define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounw define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind { ; X86_AVX256-LABEL: insert_subvector_512: ; X86_AVX256: # %bb.0: -; X86_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2 -; X86_AVX256-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; X86_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; X86_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 +; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; X86_AVX256-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 +; X86_AVX256-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; X86_AVX256-NEXT: retl ; ; X64_AVX256-LABEL: insert_subvector_512: ; X64_AVX256: # %bb.0: ; X64_AVX256-NEXT: vmovd %edi, %xmm2 ; X64_AVX256-NEXT: vpinsrd $1, %esi, %xmm2, %xmm2 -; X64_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm3 -; X64_AVX256-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; X64_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; X64_AVX256-NEXT: vpbroadcastq %xmm2, %ymm2 +; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; X64_AVX256-NEXT: retq ; ; X86_AVX512-LABEL: insert_subvector_512: diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index dc6362d499a1d..552b69748e86c 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -216,16 +216,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB1_6 ; AVX1-NEXT: LBB1_5: ## %cond.load5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB1_8 ; AVX1-NEXT: LBB1_7: ## %cond.load9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovhps (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v4f64_v4i64: @@ -259,16 +257,14 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB1_6 ; AVX2-NEXT: LBB1_5: ## %cond.load5 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = 
mem[0],xmm1[1] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB1_8 ; AVX2-NEXT: LBB1_7: ## %cond.load9 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovhpd (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v4f64_v4i64: @@ -405,16 +401,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB2_6 ; AVX1-NEXT: LBB2_5: ## %cond.load5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB2_8 ; AVX1-NEXT: LBB2_7: ## %cond.load9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB2_10 @@ -431,16 +425,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB2_14 ; AVX1-NEXT: LBB2_13: ## %cond.load21 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB2_16 ; AVX1-NEXT: LBB2_15: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v8f64_v8i1: @@ -486,16 +478,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB2_6 ; AVX2-NEXT: LBB2_5: ## %cond.load5 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB2_8 ; AVX2-NEXT: LBB2_7: ## %cond.load9 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB2_10 @@ -512,16 +502,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB2_14 ; AVX2-NEXT: LBB2_13: ## %cond.load21 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB2_16 ; AVX2-NEXT: LBB2_15: ## 
%cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v8f64_v8i1: @@ -777,16 +765,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testb $4, %al ; AVX1-NEXT: je LBB3_6 ; AVX1-NEXT: LBB3_5: ## %cond.load5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $8, %al ; AVX1-NEXT: je LBB3_8 ; AVX1-NEXT: LBB3_7: ## %cond.load9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB3_10 @@ -803,16 +789,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB3_14 ; AVX1-NEXT: LBB3_13: ## %cond.load21 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB3_16 ; AVX1-NEXT: LBB3_15: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; 
AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: je LBB3_18 @@ -829,16 +813,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX1-NEXT: je LBB3_22 ; AVX1-NEXT: LBB3_21: ## %cond.load37 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX1-NEXT: je LBB3_24 ; AVX1-NEXT: LBB3_23: ## %cond.load41 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: je LBB3_26 @@ -855,16 +837,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1-NEXT: je LBB3_30 ; AVX1-NEXT: LBB3_29: ## %cond.load53 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-NEXT: addq $8, %rdi ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX1-NEXT: je LBB3_32 ; AVX1-NEXT: LBB3_31: ## %cond.load57 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; 
AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v16f64_v16i32: @@ -939,16 +919,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $4, %al ; AVX2-NEXT: je LBB3_6 ; AVX2-NEXT: LBB3_5: ## %cond.load5 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $8, %al ; AVX2-NEXT: je LBB3_8 ; AVX2-NEXT: LBB3_7: ## %cond.load9 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB3_10 @@ -965,16 +943,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB3_14 ; AVX2-NEXT: LBB3_13: ## %cond.load21 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB3_16 ; AVX2-NEXT: LBB3_15: ## %cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB3_18 @@ -991,16 +967,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX2-NEXT: je LBB3_22 ; AVX2-NEXT: LBB3_21: ## %cond.load37 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: je LBB3_24 ; AVX2-NEXT: LBB3_23: ## %cond.load41 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB3_26 @@ -1017,16 +991,14 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB3_30 ; AVX2-NEXT: LBB3_29: ## %cond.load53 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-NEXT: addq $8, %rdi ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB3_32 ; AVX2-NEXT: LBB3_31: ## %cond.load57 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: expandload_v16f64_v16i32: @@ -2193,31 +2165,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je LBB8_10 ; AVX1-NEXT: LBB8_9: ## %cond.load13 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je LBB8_12 ; AVX1-NEXT: LBB8_11: ## %cond.load17 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je LBB8_14 ; AVX1-NEXT: LBB8_13: ## %cond.load21 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je LBB8_16 ; AVX1-NEXT: LBB8_15: ## %cond.load25 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $256, %eax ## imm = 0x100 ; AVX1-NEXT: je LBB8_18 @@ -2246,31 +2213,26 @@ define <32 x float> 
@expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX1-NEXT: je LBB8_26 ; AVX1-NEXT: LBB8_25: ## %cond.load45 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX1-NEXT: je LBB8_28 ; AVX1-NEXT: LBB8_27: ## %cond.load49 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX1-NEXT: je LBB8_30 ; AVX1-NEXT: LBB8_29: ## %cond.load53 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX1-NEXT: je LBB8_32 ; AVX1-NEXT: LBB8_31: ## %cond.load57 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX1-NEXT: je LBB8_34 @@ -2299,31 +2261,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; 
AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX1-NEXT: je LBB8_42 ; AVX1-NEXT: LBB8_41: ## %cond.load77 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX1-NEXT: je LBB8_44 ; AVX1-NEXT: LBB8_43: ## %cond.load81 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX1-NEXT: je LBB8_46 ; AVX1-NEXT: LBB8_45: ## %cond.load85 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX1-NEXT: je LBB8_48 ; AVX1-NEXT: LBB8_47: ## %cond.load89 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX1-NEXT: je LBB8_50 @@ -2352,31 +2309,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX1-NEXT: testl $268435456, %eax ## imm = 
0x10000000 ; AVX1-NEXT: je LBB8_58 ; AVX1-NEXT: LBB8_57: ## %cond.load109 -; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX1-NEXT: je LBB8_60 ; AVX1-NEXT: LBB8_59: ## %cond.load113 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX1-NEXT: je LBB8_62 ; AVX1-NEXT: LBB8_61: ## %cond.load117 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX1-NEXT: addq $4, %rdi ; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX1-NEXT: je LBB8_64 ; AVX1-NEXT: LBB8_63: ## %cond.load121 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: expandload_v32f32_v32i32: @@ -2515,31 +2467,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je LBB8_10 ; AVX2-NEXT: LBB8_9: ## %cond.load13 -; AVX2-NEXT: vmovss 
(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4],ymm0[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je LBB8_12 ; AVX2-NEXT: LBB8_11: ## %cond.load17 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je LBB8_14 ; AVX2-NEXT: LBB8_13: ## %cond.load21 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6],ymm0[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je LBB8_16 ; AVX2-NEXT: LBB8_15: ## %cond.load25 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB8_18 @@ -2568,31 +2515,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB8_26 ; AVX2-NEXT: LBB8_25: ## %cond.load45 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = 
xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB8_28 ; AVX2-NEXT: LBB8_27: ## %cond.load49 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB8_30 ; AVX2-NEXT: LBB8_29: ## %cond.load53 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB8_32 ; AVX2-NEXT: LBB8_31: ## %cond.load57 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000 ; AVX2-NEXT: je LBB8_34 @@ -2621,31 +2563,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000 ; AVX2-NEXT: je LBB8_42 ; AVX2-NEXT: LBB8_41: ## %cond.load77 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 
+; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $2097152, %eax ## imm = 0x200000 ; AVX2-NEXT: je LBB8_44 ; AVX2-NEXT: LBB8_43: ## %cond.load81 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000 ; AVX2-NEXT: je LBB8_46 ; AVX2-NEXT: LBB8_45: ## %cond.load85 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000 ; AVX2-NEXT: je LBB8_48 ; AVX2-NEXT: LBB8_47: ## %cond.load89 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000 ; AVX2-NEXT: je LBB8_50 @@ -2674,31 +2611,26 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0, ; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000 ; AVX2-NEXT: je LBB8_58 ; AVX2-NEXT: LBB8_57: ## %cond.load109 -; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; 
AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $536870912, %eax ## imm = 0x20000000 ; AVX2-NEXT: je LBB8_60 ; AVX2-NEXT: LBB8_59: ## %cond.load113 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000 ; AVX2-NEXT: je LBB8_62 ; AVX2-NEXT: LBB8_61: ## %cond.load117 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX2-NEXT: addq $4, %rdi ; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000 ; AVX2-NEXT: je LBB8_64 ; AVX2-NEXT: LBB8_63: ## %cond.load121 -; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0] -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vbroadcastss (%rdi), %ymm4 +; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-NEXT: retq ; ; AVX512-LABEL: expandload_v32f32_v32i32: diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll index a1ff2bf087827..2f00b80bb76bb 100644 --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -1359,11 +1359,10 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_16 ; AVX1-NEXT: .LBB4_15: # %cond.load19 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: 
vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX1-NEXT: .LBB4_16: # %else20 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 @@ -1393,11 +1392,10 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_32 ; AVX1-NEXT: .LBB4_31: # %cond.load58 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX1-NEXT: .LBB4_32: # %else61 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1418,9 +1416,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_42 ; AVX1-NEXT: .LBB4_41: # %cond.load84 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpinsrd $0, c+28(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX1-NEXT: .LBB4_42: # %else87 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 @@ -1428,25 +1425,22 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_44 ; AVX1-NEXT: # %bb.43: # %cond.load89 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX1-NEXT: .LBB4_44: # %else92 ; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_46 ; AVX1-NEXT: # %bb.45: # %cond.load94 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7] ; AVX1-NEXT: .LBB4_46: # %else97 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: je .LBB4_48 ; AVX1-NEXT: # %bb.47: # %cond.load99 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX1-NEXT: .LBB4_48: # %else102 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 @@ -1474,21 +1468,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_10 ; AVX1-NEXT: .LBB4_9: # %cond.load10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $0, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_12 ; AVX1-NEXT: .LBB4_11: # %cond.load13 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_14 ; AVX1-NEXT: .LBB4_13: # %cond.load16 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm3, %xmm3 -; 
AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: jne .LBB4_15 ; AVX1-NEXT: jmp .LBB4_16 @@ -1512,21 +1503,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX1-NEXT: testb $16, %al ; AVX1-NEXT: je .LBB4_26 ; AVX1-NEXT: .LBB4_25: # %cond.load43 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $0, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-NEXT: testb $32, %al ; AVX1-NEXT: je .LBB4_28 ; AVX1-NEXT: .LBB4_27: # %cond.load48 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX1-NEXT: testb $64, %al ; AVX1-NEXT: je .LBB4_30 ; AVX1-NEXT: .LBB4_29: # %cond.load53 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX1-NEXT: testb $-128, %al ; AVX1-NEXT: jne .LBB4_31 ; AVX1-NEXT: jmp .LBB4_32 @@ -1581,9 +1569,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB4_16 ; AVX2-NEXT: .LBB4_15: # %cond.load19 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $3, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: .LBB4_16: # %else20 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd 
%ymm2, %ymm0, %ymm2 @@ -1613,9 +1600,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB4_32 ; AVX2-NEXT: .LBB4_31: # %cond.load58 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $3, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-NEXT: .LBB4_32: # %else61 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0 @@ -1642,17 +1628,15 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB4_46 ; AVX2-NEXT: .LBB4_45: # %cond.load94 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7] ; AVX2-NEXT: .LBB4_46: # %else97 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: je .LBB4_48 ; AVX2-NEXT: # %bb.47: # %cond.load99 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-NEXT: .LBB4_48: # %else102 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -1676,21 +1660,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB4_10 ; AVX2-NEXT: .LBB4_9: # %cond.load10 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $0, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je 
.LBB4_12 ; AVX2-NEXT: .LBB4_11: # %cond.load13 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $1, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB4_14 ; AVX2-NEXT: .LBB4_13: # %cond.load16 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpinsrd $2, c+12(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: jne .LBB4_15 ; AVX2-NEXT: jmp .LBB4_16 @@ -1714,21 +1695,18 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB4_26 ; AVX2-NEXT: .LBB4_25: # %cond.load43 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $0, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB4_28 ; AVX2-NEXT: .LBB4_27: # %cond.load48 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: je .LBB4_30 ; AVX2-NEXT: .LBB4_29: # %cond.load53 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] ; AVX2-NEXT: testb $-128, %al ; AVX2-NEXT: jne .LBB4_31 ; AVX2-NEXT: jmp .LBB4_32 @@ -1752,15 +1730,13 @@ 
define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; AVX2-NEXT: testb $16, %al ; AVX2-NEXT: je .LBB4_42 ; AVX2-NEXT: .LBB4_41: # %cond.load84 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpinsrd $0, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] ; AVX2-NEXT: testb $32, %al ; AVX2-NEXT: je .LBB4_44 ; AVX2-NEXT: .LBB4_43: # %cond.load89 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-NEXT: testb $64, %al ; AVX2-NEXT: jne .LBB4_45 ; AVX2-NEXT: jmp .LBB4_46 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 771850f1f2ecc..2a961299d1770 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -965,16 +965,14 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_64-NEXT: retq ; KNL_64-NEXT: .LBB15_5: # %cond.load4 ; KNL_64-NEXT: vmovq %xmm0, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm1 -; KNL_64-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; KNL_64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm1 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: je .LBB15_8 ; KNL_64-NEXT: .LBB15_7: # %cond.load7 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm2, %xmm0 -; KNL_64-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; KNL_64-NEXT: vmovdqa 
%ymm2, %ymm0 ; KNL_64-NEXT: retq ; @@ -1014,16 +1012,14 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_32-NEXT: je .LBB15_6 ; KNL_32-NEXT: .LBB15_5: # %cond.load4 ; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm1 -; KNL_32-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; KNL_32-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastq (%ecx), %ymm1 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; KNL_32-NEXT: testb $8, %al ; KNL_32-NEXT: je .LBB15_8 ; KNL_32-NEXT: .LBB15_7: # %cond.load7 ; KNL_32-NEXT: vpextrd $3, %xmm0, %eax -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm0 -; KNL_32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastq (%eax), %ymm0 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 ; KNL_32-NEXT: retl ; @@ -3220,17 +3216,15 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_64-NEXT: je .LBB42_6 ; KNL_64-NEXT: # %bb.5: # %cond.load4 ; KNL_64-NEXT: vmovq %xmm2, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm3, %xmm3 -; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; KNL_64-NEXT: .LBB42_6: # %else5 ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: je .LBB42_8 ; KNL_64-NEXT: # %bb.7: # %cond.load7 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm3 -; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3 -; KNL_64-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm3 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; KNL_64-NEXT: .LBB42_8: # %else8 ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: testb $1, %al @@ -3247,9 +3241,8 @@ define <4 
x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_64-NEXT: je .LBB42_16 ; KNL_64-NEXT: .LBB42_15: # %cond.load29 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm4, %xmm4 -; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm4 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; KNL_64-NEXT: .LBB42_16: # %else33 ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: testb $1, %al @@ -3266,9 +3259,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_64-NEXT: je .LBB42_24 ; KNL_64-NEXT: .LBB42_23: # %cond.load54 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax -; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0 -; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0 -; KNL_64-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm4 +; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; KNL_64-NEXT: .LBB42_24: # %else58 ; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0 @@ -3286,9 +3278,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_64-NEXT: je .LBB42_14 ; KNL_64-NEXT: .LBB42_13: # %cond.load23 ; KNL_64-NEXT: vmovq %xmm2, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm3, %xmm4 -; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm4, %xmm4 -; KNL_64-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm4 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: jne .LBB42_15 ; KNL_64-NEXT: jmp .LBB42_16 @@ -3305,9 +3296,8 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_64-NEXT: je .LBB42_22 ; KNL_64-NEXT: .LBB42_21: # %cond.load48 ; KNL_64-NEXT: vmovq %xmm2, %rcx -; KNL_64-NEXT: vextracti128 $1, %ymm4, %xmm0 -; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm0, %xmm0 -; KNL_64-NEXT: 
vinserti128 $1, %xmm0, %ymm4, %ymm4 +; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm0 +; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] ; KNL_64-NEXT: testb $8, %al ; KNL_64-NEXT: jne .LBB42_23 ; KNL_64-NEXT: jmp .LBB42_24 @@ -3347,19 +3337,19 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: vpextrd $2, %xmm0, %edx ; KNL_32-NEXT: je .LBB42_6 ; KNL_32-NEXT: # %bb.5: # %cond.load4 -; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm2 -; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; KNL_32-NEXT: .LBB42_6: # %else5 ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: vpextrd $3, %xmm0, %esi ; KNL_32-NEXT: je .LBB42_8 ; KNL_32-NEXT: # %bb.7: # %cond.load7 -; KNL_32-NEXT: vextracti128 $1, %ymm1, %xmm0 -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 -; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm0, %xmm0 -; KNL_32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; KNL_32-NEXT: vpbroadcastd (%esi), %ymm0 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm1 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; KNL_32-NEXT: .LBB42_8: # %else8 ; KNL_32-NEXT: kmovw %k0, %ebx ; KNL_32-NEXT: testb $1, %bl @@ -3375,10 +3365,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: je .LBB42_16 ; KNL_32-NEXT: .LBB42_15: # %cond.load29 -; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm2, %xmm2 -; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm2, %xmm2 -; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL_32-NEXT: 
vpbroadcastd (%esi), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] +; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; KNL_32-NEXT: .LBB42_16: # %else33 ; KNL_32-NEXT: kmovw %k0, %ebx ; KNL_32-NEXT: testb $1, %bl @@ -3394,10 +3384,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: je .LBB42_24 ; KNL_32-NEXT: .LBB42_23: # %cond.load54 -; KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm3, %xmm3 -; KNL_32-NEXT: vpinsrd $3, 4(%esi), %xmm3, %xmm3 -; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastd (%esi), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; KNL_32-NEXT: .LBB42_24: # %else58 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 @@ -3419,10 +3409,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: testb $4, %bl ; KNL_32-NEXT: je .LBB42_14 ; KNL_32-NEXT: .LBB42_13: # %cond.load23 -; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm2 -; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm2, %xmm2 -; KNL_32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] +; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: jne .LBB42_15 ; KNL_32-NEXT: jmp .LBB42_16 @@ -3437,10 +3427,10 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6 ; KNL_32-NEXT: testb $4, %bl ; KNL_32-NEXT: je .LBB42_22 ; KNL_32-NEXT: .LBB42_21: # %cond.load48 -; 
KNL_32-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL_32-NEXT: vpinsrd $0, (%edx), %xmm3, %xmm3 -; KNL_32-NEXT: vpinsrd $1, 4(%edx), %xmm3, %xmm3 -; KNL_32-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3 +; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; KNL_32-NEXT: testb $8, %bl ; KNL_32-NEXT: jne .LBB42_23 ; KNL_32-NEXT: jmp .LBB42_24 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 2e55a372ab576..e8e45a1567362 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -3463,51 +3463,51 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1 ; AVX2-NEXT: testl $256, %eax ## imm = 0x100 ; AVX2-NEXT: je LBB22_18 ; AVX2-NEXT: LBB22_17: ## %cond.load22 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $512, %eax ## imm = 0x200 ; AVX2-NEXT: je LBB22_20 ; AVX2-NEXT: LBB22_19: ## %cond.load25 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX2-NEXT: je LBB22_22 ; AVX2-NEXT: LBB22_21: ## %cond.load28 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: 
vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX2-NEXT: je LBB22_24 ; AVX2-NEXT: LBB22_23: ## %cond.load31 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX2-NEXT: je LBB22_26 ; AVX2-NEXT: LBB22_25: ## %cond.load34 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX2-NEXT: je LBB22_28 ; AVX2-NEXT: LBB22_27: ## %cond.load37 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX2-NEXT: je LBB22_30 ; AVX2-NEXT: LBB22_29: ## %cond.load40 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 
= ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX2-NEXT: je LBB22_32 ; AVX2-NEXT: LBB22_31: ## %cond.load43 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3609,51 +3609,51 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1 ; AVX512F-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512F-NEXT: je LBB22_18 ; AVX512F-NEXT: LBB22_17: ## %cond.load22 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $512, %eax ## imm = 0x200 ; AVX512F-NEXT: je LBB22_20 ; AVX512F-NEXT: LBB22_19: ## %cond.load25 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $1024, %eax ## imm = 0x400 ; AVX512F-NEXT: je LBB22_22 ; AVX512F-NEXT: LBB22_21: ## %cond.load28 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, 
%xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX512F-NEXT: je LBB22_24 ; AVX512F-NEXT: LBB22_23: ## %cond.load31 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX512F-NEXT: je LBB22_26 ; AVX512F-NEXT: LBB22_25: ## %cond.load34 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX512F-NEXT: je LBB22_28 ; AVX512F-NEXT: LBB22_27: ## %cond.load37 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512F-NEXT: je LBB22_30 ; AVX512F-NEXT: LBB22_29: ## %cond.load40 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, 
%ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512F-NEXT: je LBB22_32 ; AVX512F-NEXT: LBB22_31: ## %cond.load43 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -3755,51 +3755,51 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1 ; AVX512VLDQ-NEXT: testl $256, %eax ## imm = 0x100 ; AVX512VLDQ-NEXT: je LBB22_18 ; AVX512VLDQ-NEXT: LBB22_17: ## %cond.load22 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 16(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $512, %eax ## imm = 0x200 ; AVX512VLDQ-NEXT: je LBB22_20 ; AVX512VLDQ-NEXT: LBB22_19: ## %cond.load25 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 18(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $1024, %eax ## imm = 0x400 ; 
AVX512VLDQ-NEXT: je LBB22_22 ; AVX512VLDQ-NEXT: LBB22_21: ## %cond.load28 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 20(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $2048, %eax ## imm = 0x800 ; AVX512VLDQ-NEXT: je LBB22_24 ; AVX512VLDQ-NEXT: LBB22_23: ## %cond.load31 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 22(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $4096, %eax ## imm = 0x1000 ; AVX512VLDQ-NEXT: je LBB22_26 ; AVX512VLDQ-NEXT: LBB22_25: ## %cond.load34 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 24(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $8192, %eax ## imm = 0x2000 ; AVX512VLDQ-NEXT: je LBB22_28 ; AVX512VLDQ-NEXT: LBB22_27: ## %cond.load37 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 26(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7,8,9,10,11,12],ymm0[13],ymm1[14,15] +; AVX512VLDQ-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $16384, %eax ## imm = 0x4000 ; AVX512VLDQ-NEXT: je LBB22_30 ; AVX512VLDQ-NEXT: LBB22_29: ## %cond.load40 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 28(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: testl $32768, %eax ## imm = 0x8000 ; AVX512VLDQ-NEXT: je LBB22_32 ; AVX512VLDQ-NEXT: LBB22_31: ## %cond.load43 -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpbroadcastw 30(%rdi), %ymm0 +; AVX512VLDQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15] +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLDQ-NEXT: retq ; @@ -7084,33 +7084,17 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) { ; SSE42-NEXT: pinsrq $0, 16(%rdi), %xmm1 ; SSE42-NEXT: retq ; -; AVX1-LABEL: load_one_mask_bit_set3: -; AVX1: ## %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_one_mask_bit_set3: -; AVX2: ## %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_one_mask_bit_set3: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; 
AVX-LABEL: load_one_mask_bit_set3: +; AVX: ## %bb.0: +; AVX-NEXT: vbroadcastsd 16(%rdi), %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set3: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; X86-AVX512-NEXT: retl %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> , <4 x i64> %val) ret <4 x i64> %res @@ -7126,17 +7110,15 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v ; ; AVX-LABEL: load_one_mask_bit_set4: ; AVX: ## %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastsd 24(%rdi), %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set4: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX512-NEXT: vbroadcastsd 24(%eax), %ymm1 +; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; X86-AVX512-NEXT: retl %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> , <4 x double> %val) ret <4 x double> %res @@ -7152,24 +7134,37 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v ; ; AVX1OR2-LABEL: load_one_mask_bit_set5: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1OR2-NEXT: vmovhps 
{{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1OR2-NEXT: vbroadcastsd 56(%rdi), %ymm2 +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1OR2-NEXT: retq ; -; AVX512-LABEL: load_one_mask_bit_set5: -; AVX512: ## %bb.0: -; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_one_mask_bit_set5: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: movb $-128, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512VLDQ-LABEL: load_one_mask_bit_set5: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $-128, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: load_one_mask_bit_set5: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: movb $-128, %al +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vbroadcastsd 56(%rdi), %zmm0 {%k1} +; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set5: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1 -; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; X86-AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; X86-AVX512-NEXT: movb $-128, %cl +; X86-AVX512-NEXT: kmovd %ecx, %k1 +; X86-AVX512-NEXT: vbroadcastsd 56(%eax), %zmm0 {%k1} ; X86-AVX512-NEXT: retl %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> , <8 x double> %val) ret <8 x double> %res @@ -7235,43 +7230,43 @@ define <16 x i64> @load_one_mask_bit_set6(<16 x i64>* %addr, <16 x i64> %val) { ; ; AVX512F-LABEL: load_one_mask_bit_set6: ; AVX512F: ## %bb.0: +; AVX512F-NEXT: movb $4, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1} ; AVX512F-NEXT: 
movb $36, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2 -; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: load_one_mask_bit_set6: ; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $4, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1} ; AVX512VLDQ-NEXT: movb $36, %al ; AVX512VLDQ-NEXT: kmovw %eax, %k1 ; AVX512VLDQ-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} -; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VLDQ-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2 -; AVX512VLDQ-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0 ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: load_one_mask_bit_set6: ; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: movb $4, %al +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vpbroadcastq 16(%rdi), %zmm0 {%k1} ; AVX512VLBW-NEXT: movb $36, %al ; AVX512VLBW-NEXT: kmovd %eax, %k1 ; AVX512VLBW-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} -; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VLBW-NEXT: vpinsrq $0, 16(%rdi), %xmm2, %xmm2 -; AVX512VLBW-NEXT: vinserti32x4 $1, %xmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; ; X86-AVX512-LABEL: load_one_mask_bit_set6: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movb $4, %cl +; X86-AVX512-NEXT: kmovd %ecx, %k1 +; X86-AVX512-NEXT: vbroadcastsd 16(%eax), %zmm0 {%k1} ; X86-AVX512-NEXT: movb $36, %cl ; X86-AVX512-NEXT: kmovd %ecx, %k1 ; X86-AVX512-NEXT: vmovdqu64 64(%eax), %zmm1 {%k1} -; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 -; X86-AVX512-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; X86-AVX512-NEXT: vinsertf32x4 $1, %xmm2, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %addr, i32 4, <16 x i1> , <16 x i64> %val) ret <16 x i64> %res From b28cb53eac70b5044b5bd0fef77530fcc6b29fb0 
Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 16 Aug 2021 12:41:58 -0400 Subject: [PATCH 217/700] [libc++][NFC] Format expression-equivalent wrappers consistently Differential Revision: https://reviews.llvm.org/D108144 --- libcxx/include/__functional/operations.h | 114 +++++++++++------------ libcxx/include/__ranges/iota_view.h | 12 +-- libcxx/include/type_traits | 62 ++++++++---- libcxx/include/variant | 9 +- 4 files changed, 107 insertions(+), 90 deletions(-) diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h index 667d17988bc4e..0c7c6d4fcfaf8 100644 --- a/libcxx/include/__functional/operations.h +++ b/libcxx/include/__functional/operations.h @@ -53,9 +53,9 @@ struct _LIBCPP_TEMPLATE_VIS plus template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) + _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) + _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) + _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) + _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) + _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) + _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -90,9 +90,9 @@ struct _LIBCPP_TEMPLATE_VIS minus template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) - _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) - _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) - _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) - _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) - _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) - _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -127,9 +127,9 @@ struct _LIBCPP_TEMPLATE_VIS 
multiplies template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -164,9 +164,9 @@ struct _LIBCPP_TEMPLATE_VIS divides template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) / _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) / _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) / _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) / _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) / _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) / _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -201,9 +201,9 @@ struct _LIBCPP_TEMPLATE_VIS modulus template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) % _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) % _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) % _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) % _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) % _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) % _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -237,9 +237,9 @@ struct _LIBCPP_TEMPLATE_VIS negate template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_Tp&& __x) const - _NOEXCEPT_(noexcept(- 
_VSTD::forward<_Tp>(__x))) - -> decltype (- _VSTD::forward<_Tp>(__x)) - { return - _VSTD::forward<_Tp>(__x); } + noexcept(noexcept(- _VSTD::forward<_Tp>(__x))) + -> decltype( - _VSTD::forward<_Tp>(__x)) + { return - _VSTD::forward<_Tp>(__x); } typedef void is_transparent; }; #endif @@ -276,9 +276,9 @@ struct _LIBCPP_TEMPLATE_VIS bit_and template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) & _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) & _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) & _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) & _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) & _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) & _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -307,9 +307,9 @@ struct _LIBCPP_TEMPLATE_VIS bit_not template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_Tp&& __x) const - _NOEXCEPT_(noexcept(~_VSTD::forward<_Tp>(__x))) - -> decltype (~_VSTD::forward<_Tp>(__x)) - { return ~_VSTD::forward<_Tp>(__x); } + noexcept(noexcept(~_VSTD::forward<_Tp>(__x))) + -> decltype( ~_VSTD::forward<_Tp>(__x)) + { return ~_VSTD::forward<_Tp>(__x); } typedef void is_transparent; }; #endif @@ -344,9 +344,9 @@ struct _LIBCPP_TEMPLATE_VIS bit_or template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) | _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) | _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) | _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) | _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) | _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) | _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -381,9 +381,9 
@@ struct _LIBCPP_TEMPLATE_VIS bit_xor template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) ^ _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) ^ _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) ^ _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) ^ _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) ^ _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) ^ _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -420,9 +420,9 @@ struct _LIBCPP_TEMPLATE_VIS equal_to template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) == _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) == _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) == _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) == _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) == _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) == _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -457,9 +457,9 @@ struct _LIBCPP_TEMPLATE_VIS not_equal_to template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) != _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) != _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) != _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) != _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) != _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) != _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -494,9 +494,9 @@ struct _LIBCPP_TEMPLATE_VIS less template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto 
operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) < _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) < _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) < _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) < _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) < _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) < _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -531,9 +531,9 @@ struct _LIBCPP_TEMPLATE_VIS less_equal template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) <= _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) <= _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) <= _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) <= _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) <= _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) <= _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -568,9 +568,9 @@ struct _LIBCPP_TEMPLATE_VIS greater_equal template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) >= _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) >= _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) >= _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) >= _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) >= _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) >= _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -605,9 +605,9 @@ struct _LIBCPP_TEMPLATE_VIS greater template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) > 
_VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) > _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) > _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) > _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) > _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) > _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -644,9 +644,9 @@ struct _LIBCPP_TEMPLATE_VIS logical_and template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) && _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) && _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) && _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) && _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) && _VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) && _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif @@ -680,9 +680,9 @@ struct _LIBCPP_TEMPLATE_VIS logical_not template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_Tp&& __x) const - _NOEXCEPT_(noexcept(!_VSTD::forward<_Tp>(__x))) - -> decltype (!_VSTD::forward<_Tp>(__x)) - { return !_VSTD::forward<_Tp>(__x); } + noexcept(noexcept(!_VSTD::forward<_Tp>(__x))) + -> decltype( !_VSTD::forward<_Tp>(__x)) + { return !_VSTD::forward<_Tp>(__x); } typedef void is_transparent; }; #endif @@ -717,9 +717,9 @@ struct _LIBCPP_TEMPLATE_VIS logical_or template _LIBCPP_CONSTEXPR_AFTER_CXX11 _LIBCPP_INLINE_VISIBILITY auto operator()(_T1&& __t, _T2&& __u) const - _NOEXCEPT_(noexcept(_VSTD::forward<_T1>(__t) || _VSTD::forward<_T2>(__u))) - -> decltype (_VSTD::forward<_T1>(__t) || _VSTD::forward<_T2>(__u)) - { return _VSTD::forward<_T1>(__t) || _VSTD::forward<_T2>(__u); } + noexcept(noexcept(_VSTD::forward<_T1>(__t) || _VSTD::forward<_T2>(__u))) + -> decltype( _VSTD::forward<_T1>(__t) || 
_VSTD::forward<_T2>(__u)) + { return _VSTD::forward<_T1>(__t) || _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; #endif diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h index 047d8460c268b..bd679cdf35d60 100644 --- a/libcxx/include/__ranges/iota_view.h +++ b/libcxx/include/__ranges/iota_view.h @@ -374,19 +374,15 @@ namespace __iota { _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Start&& __start) const noexcept(noexcept(ranges::iota_view(_VSTD::forward<_Start>(__start)))) - -> decltype(ranges::iota_view(_VSTD::forward<_Start>(__start))) - { - return ranges::iota_view(_VSTD::forward<_Start>(__start)); - } + -> decltype( ranges::iota_view(_VSTD::forward<_Start>(__start))) + { return ranges::iota_view(_VSTD::forward<_Start>(__start)); } template _LIBCPP_HIDE_FROM_ABI constexpr auto operator()(_Start&& __start, _Bound&& __bound) const noexcept(noexcept(ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound)))) - -> decltype(ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound))) - { - return ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound)); - } + -> decltype( ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound))) + { return ranges::iota_view(_VSTD::forward<_Start>(__start), _VSTD::forward<_Bound>(__bound)); } }; } // namespace __iota diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index e685a126cc85c..b6e1c20bf397f 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -3811,10 +3811,6 @@ using __enable_if_bullet6 = typename enable_if // fall back - none of the bullets -#define _LIBCPP_INVOKE_RETURN(...) 
\ - noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) \ - { return __VA_ARGS__; } - template auto __invoke(__any, _Args&& ...__args) -> __nat; @@ -3828,42 +3824,54 @@ template (__a0).*__f)(static_cast<_Args&&>(__args)...)) + noexcept(noexcept((static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...))) + -> decltype( (static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...)) + { return (static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...); } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _A0&& __a0, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN((static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...)) + noexcept(noexcept((static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...))) + -> decltype( (static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...)) + { return (static_cast<_A0&&>(__a0).*__f)(static_cast<_Args&&>(__args)...); } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 auto __invoke(_Fp&& __f, _A0&& __a0, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN((__a0.get().*__f)(static_cast<_Args&&>(__args)...)) + noexcept(noexcept((__a0.get().*__f)(static_cast<_Args&&>(__args)...))) + -> decltype( (__a0.get().*__f)(static_cast<_Args&&>(__args)...)) + { return (__a0.get().*__f)(static_cast<_Args&&>(__args)...); } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _A0&& __a0, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN((__a0.get().*__f)(static_cast<_Args&&>(__args)...)) + noexcept(noexcept((__a0.get().*__f)(static_cast<_Args&&>(__args)...))) + -> decltype( (__a0.get().*__f)(static_cast<_Args&&>(__args)...)) + { return (__a0.get().*__f)(static_cast<_Args&&>(__args)...); } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 auto __invoke(_Fp&& __f, _A0&& __a0, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN(((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...)) + 
noexcept(noexcept(((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...))) + -> decltype( ((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...)) + { return ((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...); } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _A0&& __a0, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN(((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...)) + noexcept(noexcept(((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...))) + -> decltype( ((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...)) + { return ((*static_cast<_A0&&>(__a0)).*__f)(static_cast<_Args&&>(__args)...); } // bullets 4, 5 and 6 @@ -3872,42 +3880,54 @@ template (__a0).*__f) + noexcept(noexcept(static_cast<_A0&&>(__a0).*__f)) + -> decltype( static_cast<_A0&&>(__a0).*__f) + { return static_cast<_A0&&>(__a0).*__f; } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _A0&& __a0) -_LIBCPP_INVOKE_RETURN(static_cast<_A0&&>(__a0).*__f) + noexcept(noexcept(static_cast<_A0&&>(__a0).*__f)) + -> decltype( static_cast<_A0&&>(__a0).*__f) + { return static_cast<_A0&&>(__a0).*__f; } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 auto __invoke(_Fp&& __f, _A0&& __a0) -_LIBCPP_INVOKE_RETURN(__a0.get().*__f) + noexcept(noexcept(__a0.get().*__f)) + -> decltype( __a0.get().*__f) + { return __a0.get().*__f; } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _A0&& __a0) -_LIBCPP_INVOKE_RETURN(__a0.get().*__f) + noexcept(noexcept(__a0.get().*__f)) + -> decltype( __a0.get().*__f) + { return __a0.get().*__f; } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 auto __invoke(_Fp&& __f, _A0&& __a0) -_LIBCPP_INVOKE_RETURN((*static_cast<_A0&&>(__a0)).*__f) + noexcept(noexcept((*static_cast<_A0&&>(__a0)).*__f)) + -> decltype( 
(*static_cast<_A0&&>(__a0)).*__f) + { return (*static_cast<_A0&&>(__a0)).*__f; } template > inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _A0&& __a0) -_LIBCPP_INVOKE_RETURN((*static_cast<_A0&&>(__a0)).*__f) + noexcept(noexcept((*static_cast<_A0&&>(__a0)).*__f)) + -> decltype( (*static_cast<_A0&&>(__a0)).*__f) + { return (*static_cast<_A0&&>(__a0)).*__f; } // bullet 7 @@ -3915,15 +3935,17 @@ template inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 auto __invoke(_Fp&& __f, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN(static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...)) + noexcept(noexcept(static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...))) + -> decltype( static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...)) + { return static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...); } template inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR auto __invoke_constexpr(_Fp&& __f, _Args&& ...__args) -_LIBCPP_INVOKE_RETURN(static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...)) - -#undef _LIBCPP_INVOKE_RETURN + noexcept(noexcept(static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...))) + -> decltype( static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...)) + { return static_cast<_Fp&&>(__f)(static_cast<_Args&&>(__args)...); } // __invokable template diff --git a/libcxx/include/variant b/libcxx/include/variant index 700e6f3f11514..b0ea73de07670 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1718,11 +1718,10 @@ inline _LIBCPP_INLINE_VISIBILITY template inline _LIBCPP_INLINE_VISIBILITY -auto swap(variant<_Types...>& __lhs, - variant<_Types...>& __rhs) noexcept(noexcept(__lhs.swap(__rhs))) - -> decltype(__lhs.swap(__rhs)) { - __lhs.swap(__rhs); -} +auto swap(variant<_Types...>& __lhs, variant<_Types...>& __rhs) + noexcept(noexcept(__lhs.swap(__rhs))) + -> decltype( __lhs.swap(__rhs)) + { return __lhs.swap(__rhs); } template struct _LIBCPP_TEMPLATE_VIS hash< From 
78cb1adc5c17850e1a1b33f78f5657ca38493c8d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 17 Aug 2021 09:06:39 -0700 Subject: [PATCH 218/700] [Object] Move llvm-nm's symbol version utility to ELFObjectFile::readDynsymVersions The utility can be reused by llvm-objdump -T. Reviewed By: jhenderson Differential Revision: https://reviews.llvm.org/D108096 --- llvm/include/llvm/Object/ELFObjectFile.h | 4 ++ llvm/lib/Object/ELFObjectFile.cpp | 69 +++++++++++++++++++ llvm/tools/llvm-nm/llvm-nm.cpp | 87 ++---------------------- 3 files changed, 77 insertions(+), 83 deletions(-) diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index c87a09f86fae1..386dd24f52efc 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -96,6 +96,10 @@ class ELFObjectFileBase : public ObjectFile { std::vector, uint64_t>> getPltAddresses() const; + + /// Returns a vector containing a symbol version for each dynamic symbol. + /// Returns an empty vector if version sections do not exist. + Expected> readDynsymVersions() const; }; class ELFSectionRef : public SectionRef { diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 9efb28c6da2b3..2d6d25d2688d4 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -654,3 +654,72 @@ ELFObjectFileBase::getPltAddresses() const { } return Result; } + +template +static Expected> +readDynsymVersionsImpl(const ELFFile &EF, + ELFObjectFileBase::elf_symbol_iterator_range Symbols) { + using Elf_Shdr = typename ELFT::Shdr; + const Elf_Shdr *VerSec = nullptr; + const Elf_Shdr *VerNeedSec = nullptr; + const Elf_Shdr *VerDefSec = nullptr; + // The user should ensure sections() can't fail here. 
+ for (const Elf_Shdr &Sec : cantFail(EF.sections())) { + if (Sec.sh_type == ELF::SHT_GNU_versym) + VerSec = &Sec; + else if (Sec.sh_type == ELF::SHT_GNU_verdef) + VerDefSec = &Sec; + else if (Sec.sh_type == ELF::SHT_GNU_verneed) + VerNeedSec = &Sec; + } + if (!VerSec) + return std::vector(); + + Expected, 0>> MapOrErr = + EF.loadVersionMap(VerNeedSec, VerDefSec); + if (!MapOrErr) + return MapOrErr.takeError(); + + std::vector Ret; + size_t I = 0; + for (auto It = Symbols.begin(), E = Symbols.end(); It != E; ++It) { + ++I; + Expected VerEntryOrErr = + EF.template getEntry(*VerSec, I); + if (!VerEntryOrErr) + return createError("unable to read an entry with index " + Twine(I) + + " from " + describe(EF, *VerSec) + ": " + + toString(VerEntryOrErr.takeError())); + + Expected FlagsOrErr = It->getFlags(); + if (!FlagsOrErr) + return createError("unable to read flags for symbol with index " + + Twine(I) + ": " + toString(FlagsOrErr.takeError())); + + bool IsDefault; + Expected VerOrErr = EF.getSymbolVersionByIndex( + (*VerEntryOrErr)->vs_index, IsDefault, *MapOrErr, + (*FlagsOrErr) & SymbolRef::SF_Undefined); + if (!VerOrErr) + return createError("unable to get a version for entry " + Twine(I) + + " of " + describe(EF, *VerSec) + ": " + + toString(VerOrErr.takeError())); + + Ret.push_back({(*VerOrErr).str(), IsDefault}); + } + + return Ret; +} + +Expected> +ELFObjectFileBase::readDynsymVersions() const { + elf_symbol_iterator_range Symbols = getDynamicSymbolIterators(); + if (const auto *Obj = dyn_cast(this)) + return readDynsymVersionsImpl(Obj->getELFFile(), Symbols); + if (const auto *Obj = dyn_cast(this)) + return readDynsymVersionsImpl(Obj->getELFFile(), Symbols); + if (const auto *Obj = dyn_cast(this)) + return readDynsymVersionsImpl(Obj->getELFFile(), Symbols); + return readDynsymVersionsImpl(cast(this)->getELFFile(), + Symbols); +} diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp index ffb427a3f2bdf..f713fb8eb35f9 100644 --- 
a/llvm/tools/llvm-nm/llvm-nm.cpp +++ b/llvm/tools/llvm-nm/llvm-nm.cpp @@ -1575,90 +1575,11 @@ static void dumpSymbolsFromDLInfoMachO(MachOObjectFile &MachO) { } } -namespace { -struct SymbolVersion { - std::string Name; - bool IsDefault; -}; -} // namespace - -template -static Expected> -readSymbolVersionsELF(const ELFFile &Obj, StringRef FileName, - ELFObjectFileBase::elf_symbol_iterator_range Symbols) { - using Elf_Shdr = typename ELFT::Shdr; - - // We called sections() earlier, so can't fail here. - typename ELFT::ShdrRange SectionsOrErr = cantFail(Obj.sections()); - const Elf_Shdr *SymVerSec = nullptr; - const Elf_Shdr *SymVerNeedSec = nullptr; - const Elf_Shdr *SymVerDefSec = nullptr; - for (const Elf_Shdr &Sec : SectionsOrErr) { - if (Sec.sh_type == ELF::SHT_GNU_versym) - SymVerSec = &Sec; - else if (Sec.sh_type == ELF::SHT_GNU_verdef) - SymVerDefSec = &Sec; - else if (Sec.sh_type == ELF::SHT_GNU_verneed) - SymVerNeedSec = &Sec; - } - - if (!SymVerSec) - return std::vector{}; - - Expected, 0>> MapOrErr = - Obj.loadVersionMap(SymVerNeedSec, SymVerDefSec); - if (!MapOrErr) - return MapOrErr.takeError(); - - std::vector Ret; - size_t I = 0; - for (auto It = Symbols.begin(), E = Symbols.end(); It != E; ++It) { - ++I; - Expected VerEntryOrErr = - Obj.template getEntry(*SymVerSec, I); - if (!VerEntryOrErr) - return createError("unable to read an entry with index " + Twine(I) + - " from " + describe(Obj, *SymVerSec) + ": " + - toString(VerEntryOrErr.takeError())); - - Expected FlagsOrErr = It->getFlags(); - if (!FlagsOrErr) - return createError("unable to read flags for symbol with index " + - Twine(I) + ": " + toString(FlagsOrErr.takeError())); - - bool IsDefault; - Expected VerOrErr = Obj.getSymbolVersionByIndex( - (*VerEntryOrErr)->vs_index, IsDefault, *MapOrErr, - (*FlagsOrErr) & SymbolRef::SF_Undefined); - if (!VerOrErr) - return createError("unable to get a version for entry " + Twine(I) + - " of " + describe(Obj, *SymVerSec) + ": " + - 
toString(VerOrErr.takeError())); - - Ret.push_back({(*VerOrErr).str(), IsDefault}); - } - - return Ret; -} - -static Expected> -readSymbolVersionsELF(const ELFObjectFileBase &Obj, - ELFObjectFileBase::elf_symbol_iterator_range Symbols) { - if (const auto *ELF = dyn_cast(&Obj)) - return readSymbolVersionsELF(ELF->getELFFile(), Obj.getFileName(), Symbols); - else if (const auto *ELF = dyn_cast(&Obj)) - return readSymbolVersionsELF(ELF->getELFFile(), Obj.getFileName(), Symbols); - else if (const auto *ELF = dyn_cast(&Obj)) - return readSymbolVersionsELF(ELF->getELFFile(), Obj.getFileName(), Symbols); - return readSymbolVersionsELF(cast(&Obj)->getELFFile(), - Obj.getFileName(), Symbols); -} - static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, StringRef ArchiveName = {}, StringRef ArchitectureName = {}) { auto Symbols = Obj.symbols(); - std::vector SymbolVersions; + std::vector SymbolVersions; if (DynamicSyms) { const auto *E = dyn_cast(&Obj); if (!E) { @@ -1667,8 +1588,8 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, } Symbols = E->getDynamicSymbolIterators(); - if (Expected> VersionsOrErr = - readSymbolVersionsELF(*E, Symbols)) + if (Expected> VersionsOrErr = + E->readDynsymVersions()) SymbolVersions = std::move(*VersionsOrErr); else WithColor::warning(errs(), ToolName) @@ -1738,7 +1659,7 @@ static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName, } if (!SymbolVersions.empty() && !SymbolVersions[I].Name.empty()) S.Name += - (SymbolVersions[I].IsDefault ? "@@" : "@") + SymbolVersions[I].Name; + (SymbolVersions[I].IsVerDef ? "@@" : "@") + SymbolVersions[I].Name; S.Sym = Sym; SymbolList.push_back(S); From 836649e04040251644cdd94d4eb033091e1dc220 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 17 Aug 2021 08:59:13 -0700 Subject: [PATCH 219/700] Allow setting attributes in build method generated by YAML-gen. 
Reviewed By: gysit Differential Revision: https://reviews.llvm.org/D108182 --- mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index 83447f4930170..b95603056da3f 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -474,11 +474,13 @@ def {0} : LinalgStructuredBase_Op<"{1}", !listconcat([ }]>, OpBuilder< (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs), + "ValueRange":$outputs, + CArg<"ArrayRef", "{{}">:$attributes), [{{ $_state.addOperands(inputs); $_state.addOperands(outputs); $_state.addTypes(resultTensorTypes); + $_state.addAttributes(attributes); $_state.addAttribute( "operand_segment_sizes", $_builder.getI32VectorAttr({{ From c56b4cfd4b2d74ce3b54fe0b1c5fb557b7c60200 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 17 Aug 2021 09:10:50 -0700 Subject: [PATCH 220/700] [llvm-objdump] -T: print symbol versions Similar to D94907 (llvm-nm -D). The output will match GNU objdump 2.37. Older versions don't use ` (version)` for undefined symbols. Reviewed By: jhenderson Differential Revision: https://reviews.llvm.org/D108097 --- .../llvm-objdump/ELF/dynsym-version.test | 155 ++++++++++++++++++ llvm/tools/llvm-objdump/llvm-objdump.cpp | 27 ++- llvm/tools/llvm-objdump/llvm-objdump.h | 2 + 3 files changed, 180 insertions(+), 4 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/ELF/dynsym-version.test diff --git a/llvm/test/tools/llvm-objdump/ELF/dynsym-version.test b/llvm/test/tools/llvm-objdump/ELF/dynsym-version.test new file mode 100644 index 0000000000000..e21b3544be854 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/dynsym-version.test @@ -0,0 +1,155 @@ +## Check we print symbol versions, when they are available. 
+ +## Test undefined symbols. +# RUN: yaml2obj %s -o %t-undef.o +# RUN: llvm-objdump -T %t-undef.o 2>&1 | tr '\t' '|' | FileCheck %s \ +# RUN: -DFILE=%t-undef.o --check-prefix=UNDEF --match-full-lines --strict-whitespace + +## version2sym and version3sym are invalid: undefined symbols cannot refer to .gnu.version_d. +## We still check their behaviors. +# UNDEF:DYNAMIC SYMBOL TABLE: +# UNDEF-NEXT:0000000000000000 D *UND*|0000000000000000 localversym +# UNDEF-NEXT:0000000000000000 D *UND*|0000000000000000 globalversym +# UNDEF-NEXT:0000000000000000 D *UND*|0000000000000000 (v2) version2sym +# UNDEF-NEXT:0000000000000000 D *UND*|0000000000000000 (v3hidden) version3sym +# UNDEF-NEXT:0000000000000000 D *UND*|0000000000000000 (v4) version4sym +# UNDEF-NEXT:0000000000000000 D *UND*|0000000000000000 (v5hidden) .hidden version5sym + +## Test defined symbols. +# RUN: yaml2obj -DINDEX=0x1 %s -o %t-def.o +# RUN: llvm-objdump -T %t-def.o 2>&1 | tr '\t' '|' | FileCheck %s \ +# RUN: -DFILE=%t-def.o --check-prefix=DEF --match-full-lines --strict-whitespace + +# DEF:DYNAMIC SYMBOL TABLE: +# DEF-NEXT:0000000000000000 g D .gnu.version|0000000000000000 localversym +# DEF-NEXT:0000000000000000 g D .gnu.version|0000000000000000 globalversym +# DEF-NEXT:0000000000000000 g D .gnu.version|0000000000000000 v2 version2sym +# DEF-NEXT:0000000000000000 g D .gnu.version|0000000000000000 (v3hidden) version3sym +# DEF-NEXT:0000000000000000 g D .gnu.version|0000000000000000 (v4) version4sym +# DEF-NEXT:0000000000000000 g D .gnu.version|0000000000000000 (v5hidden) .hidden version5sym + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN +Sections: + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] +## 0x8000 is a special VERSYM_HIDDEN bit. 
+ Entries: [ 0, 0, 1, 2, 0x8003, 4, 0x8005 ] + ShSize: [[VERSYMSIZE=]] + - Name: .gnu.version_d + Type: SHT_GNU_verdef + Flags: [ SHF_ALLOC ] + Link: .dynstr + AddressAlign: 0x4 + Info: 0x2 + ShOffset: [[VERDEFOFFSET=]] + Entries: + - VersionNdx: 2 + Names: + - v2 + - VersionNdx: 3 + Names: + - v3hidden + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Link: .dynstr + Info: 0x2 + Dependencies: + - Version: 1 + File: file1.so + Entries: + - Name: v4 + Hash: 0 + Flags: 0 + Other: 4 + - Version: 1 + File: file2.0 + Entries: + - Name: v5hidden + Hash: 0 + Flags: 0 + Other: 5 + - Name: .dynsym + Type: SHT_DYNSYM + EntSize: [[ENTSIZE=]] +DynamicSymbols: + - Name: localversym + Index: [[INDEX=]] + Binding: STB_GLOBAL + - Name: globalversym + Index: [[INDEX=]] + Binding: STB_GLOBAL + - Name: version2sym + Index: [[INDEX=]] + Binding: STB_GLOBAL + - Name: version3sym + Index: [[INDEX=]] + Binding: STB_GLOBAL + - Name: version4sym + Index: [[INDEX=]] + Binding: STB_GLOBAL + - Name: version5sym + Index: [[INDEX=]] + Other: [ STV_HIDDEN ] + Binding: STB_GLOBAL + +## Test the output with a long version name. 
+# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: llvm-objdump -T %t2 2>&1 | tr '\t' '|' | FileCheck %s \ +# RUN: --check-prefix=LONGNAME --match-full-lines --strict-whitespace + +# LONGNAME:DYNAMIC SYMBOL TABLE: +# LONGNAME-NEXT:0000000000000000 g D .gnu.version|0000000000000000 v2 sym1 +# LONGNAME-NEXT:0000000000000000 g D .gnu.version|0000000000000000 v3withverylongname sym2 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN +Sections: + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Entries: [ 1, 2, 3 ] + - Name: .gnu.version_d + Type: SHT_GNU_verdef + Flags: [ SHF_ALLOC ] + Link: .dynstr + AddressAlign: 0x4 + Info: 0x2 + Entries: + - VersionNdx: 2 + Names: + - v2 + - VersionNdx: 3 + Names: + - v3withverylongname + - Name: .dynsym + Type: SHT_DYNSYM +DynamicSymbols: + - Name: sym1 + Index: 1 + Binding: STB_GLOBAL + - Name: sym2 + Index: 1 + Binding: STB_GLOBAL + +## Check we report a warning when we are unable to read a SHT_GNU_versym section entry. +## In this case, the section has a size that is not a multiple of its sh_entsize. 
+ +# RUN: yaml2obj -DVERSYMSIZE=0xff %s -o %t2-broken-versym.o +# RUN: llvm-objdump -T %t2-broken-versym.o 2>&1 | FileCheck %s --check-prefixes=VERSION-ERR1,NOVER + +# VERSION-ERR1:warning: {{.*}}: unable to read an entry with index 1 from SHT_GNU_versym section +# NOVER-NEXT:0000000000000000 D *UND* 0000000000000000 localversym +# NOVER-NEXT:0000000000000000 D *UND* 0000000000000000 globalversym +# NOVER-NEXT:0000000000000000 D *UND* 0000000000000000 version2sym +# NOVER-NEXT:0000000000000000 D *UND* 0000000000000000 version3sym +# NOVER-NEXT:0000000000000000 D *UND* 0000000000000000 version4sym +# NOVER-NEXT:0000000000000000 D *UND* 0000000000000000 .hidden version5sym diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index a062957cc7430..330597d04f0e5 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1921,7 +1921,8 @@ void objdump::printSymbolTable(const ObjectFile *O, StringRef ArchiveName, if (!DumpDynamic) { outs() << "\nSYMBOL TABLE:\n"; for (auto I = O->symbol_begin(); I != O->symbol_end(); ++I) - printSymbol(O, *I, FileName, ArchiveName, ArchitectureName, DumpDynamic); + printSymbol(O, *I, {}, FileName, ArchiveName, ArchitectureName, + DumpDynamic); return; } @@ -1934,12 +1935,21 @@ void objdump::printSymbolTable(const ObjectFile *O, StringRef ArchiveName, } const ELFObjectFileBase *ELF = cast(O); - for (auto I = ELF->getDynamicSymbolIterators().begin(); - I != ELF->getDynamicSymbolIterators().end(); ++I) - printSymbol(O, *I, FileName, ArchiveName, ArchitectureName, DumpDynamic); + auto Symbols = ELF->getDynamicSymbolIterators(); + Expected> SymbolVersionsOrErr = + ELF->readDynsymVersions(); + if (!SymbolVersionsOrErr) { + reportWarning(toString(SymbolVersionsOrErr.takeError()), FileName); + SymbolVersionsOrErr = std::vector(); + (void)!SymbolVersionsOrErr; + } + for (auto &Sym : Symbols) + printSymbol(O, Sym, *SymbolVersionsOrErr, FileName, ArchiveName, + 
ArchitectureName, DumpDynamic); } void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol, + ArrayRef SymbolVersions, StringRef FileName, StringRef ArchiveName, StringRef ArchitectureName, bool DumpDynamic) { const MachOObjectFile *MachO = dyn_cast(O); @@ -2044,6 +2054,15 @@ void objdump::printSymbol(const ObjectFile *O, const SymbolRef &Symbol, } if (O->isELF()) { + if (!SymbolVersions.empty()) { + const VersionEntry &Ver = + SymbolVersions[Symbol.getRawDataRefImpl().d.b - 1]; + std::string Str; + if (!Ver.Name.empty()) + Str = Ver.IsVerDef ? ' ' + Ver.Name : '(' + Ver.Name + ')'; + outs() << ' ' << left_justify(Str, 12); + } + uint8_t Other = ELFSymbolRef(Symbol).getOther(); switch (Other) { case ELF::STV_DEFAULT: diff --git a/llvm/tools/llvm-objdump/llvm-objdump.h b/llvm/tools/llvm-objdump/llvm-objdump.h index 3796878558dec..d9fc3bfe66a5d 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.h +++ b/llvm/tools/llvm-objdump/llvm-objdump.h @@ -26,6 +26,7 @@ class ELFSectionRef; class MachOObjectFile; class MachOUniversalBinary; class RelocationRef; +struct VersionEntry; } // namespace object namespace objdump { @@ -137,6 +138,7 @@ void printSymbolTable(const object::ObjectFile *O, StringRef ArchiveName, StringRef ArchitectureName = StringRef(), bool DumpDynamic = false); void printSymbol(const object::ObjectFile *O, const object::SymbolRef &Symbol, + ArrayRef SymbolVersions, StringRef FileName, StringRef ArchiveName, StringRef ArchitectureName, bool DumpDynamic); [[noreturn]] void reportError(StringRef File, const Twine &Message); From 6d5e31baaa8d7141777c7937051577e0f90d230c Mon Sep 17 00:00:00 2001 From: Tozer Date: Tue, 17 Aug 2021 17:09:44 +0100 Subject: [PATCH 221/700] Fix 2: [MCParser] Correctly handle CRLF line ends when consuming line comments Fixes an issue with revision 5c6f748c and ad40cb88. Adds an mcpu argument to the test command, preventing an invalid default CPU from being used on some platforms. 
--- llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s b/llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s index e62cdd269bc8f..db797a75c2617 100644 --- a/llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s +++ b/llvm/test/tools/llvm-mca/X86/directives-handle-crlf.s @@ -1,4 +1,4 @@ -# RUN: llvm-mca -mtriple=x86_64-unknown-unknown %s +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic %s # LLVM-MCA-BEGIN foo addl $42, %eax # LLVM-MCA-END From 9dabacd09fdd52b5995546794290651c477d3885 Mon Sep 17 00:00:00 2001 From: Denys Petrov Date: Tue, 3 Aug 2021 19:09:00 +0300 Subject: [PATCH 222/700] [analyzer] Adjust JS code of analyzer's HTML report for IE support. Summary: Change and replace some functions which IE does not support. This patch is made as a continuation of D92928 revision. Also improve hot keys behavior. Differential Revision: https://reviews.llvm.org/D107366 --- .../StaticAnalyzer/Core/HTMLDiagnostics.cpp | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index 3ee12c0bdf651..c90046ffb4131 100644 --- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -1289,15 +1289,32 @@ var findNum = function() { return out; }; +var classListAdd = function(el, theClass) { + if(!el.className.baseVal) + el.className += " " + theClass; + else + el.className.baseVal += " " + theClass; +}; + +var classListRemove = function(el, theClass) { + var className = (!el.className.baseVal) ? 
+ el.className : el.className.baseVal; + className = className.replace(" " + theClass, ""); + if(!el.className.baseVal) + el.className = className; + else + el.className.baseVal = className; +}; + var scrollTo = function(el) { querySelectorAllArray(".selected").forEach(function(s) { - s.classList.remove("selected"); + classListRemove(s, "selected"); }); - el.classList.add("selected"); + classListAdd(el, "selected"); window.scrollBy(0, el.getBoundingClientRect().top - (window.innerHeight / 2)); highlightArrowsForSelectedEvent(); -} +}; var move = function(num, up, numItems) { if (num == 1 && up || num == numItems - 1 && !up) { @@ -1332,9 +1349,11 @@ window.addEventListener("keydown", function (event) { if (event.defaultPrevented) { return; } - if (event.key == "j") { + // key 'j' + if (event.keyCode == 74) { navigateTo(/*up=*/false); - } else if (event.key == "k") { + // key 'k' + } else if (event.keyCode == 75) { navigateTo(/*up=*/true); } else { return; @@ -1350,8 +1369,11 @@ StringRef HTMLDiagnostics::generateArrowDrawingJavascript() {