diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 654a5f10cea96..370c301731337 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -70,6 +70,9 @@ enum class RecurKind { FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of ///< (x,y) is increasing loop induction, and both x and y ///< are integer type, producing a UMax reduction. + FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y + ///< are an integer type, one is the current recurrence value, + ///< and the other is an arbitrary value. // clang-format on // TODO: Any_of and FindLast reduction need not be restricted to integer type // only. @@ -175,13 +178,12 @@ class RecurrenceDescriptor { /// Returns a struct describing whether the instruction is either a /// Select(ICmp(A, B), X, Y), or /// Select(FCmp(A, B), X, Y) - /// where one of (X, Y) is an increasing (FindLast) or decreasing (FindFirst) - /// loop induction variable, and the other is a PHI value. - // TODO: Support non-monotonic variable. FindLast does not need be restricted - // to increasing loop induction variables. - LLVM_ABI static InstDesc isFindIVPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE); + /// where one of (X, Y) is an increasing (FindLastIV) or decreasing + /// (FindFirstIV) loop induction variable, or an arbitrary integer value + /// (FindLast), and the other is a PHI value. + LLVM_ABI static InstDesc isFindPattern(RecurKind Kind, Loop *TheLoop, + PHINode *OrigPhi, Instruction *I, + ScalarEvolution &SE); /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. 
@@ -305,6 +307,13 @@ class RecurrenceDescriptor { isFindLastIVRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is an arbitrary value and the + /// other is a recurrence. + static bool isFindLastRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::FindLast; + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 9f8ac6e8e2e0b..7705c0459d829 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -56,6 +56,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + // TODO: Make type-agnostic. + case RecurKind::FindLast: return true; } return false; @@ -691,9 +693,9 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // value of the data type or a non-constant value by using mask and multiple // reduction operations. RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE) { +RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, + PHINode *OrigPhi, Instruction *I, + ScalarEvolution &SE) { // TODO: Support the vectorization of FindLastIV when the reduction phi is // used by more than one select instruction. 
This vectorization is only // performed when the SCEV of each increasing induction variable used by the @@ -702,8 +704,10 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, return InstDesc(false, I); // We are looking for selects of the form: - // select(cmp(), phi, loop_induction) or - // select(cmp(), loop_induction, phi) + // select(cmp(), phi, value) or + // select(cmp(), value, phi) + // where 'value' is a loop induction variable + // (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast). // TODO: Match selects with multi-use cmp conditions. Value *NonRdxPhi = nullptr; if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi), @@ -712,6 +716,25 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, m_Value(NonRdxPhi))))) return InstDesc(false, I); + if (isFindLastRecurrenceKind(Kind)) { + // Must be an integer scalar. + Type *Type = OrigPhi->getType(); + if (!Type->isIntegerTy()) + return InstDesc(false, I); + + // FIXME: Support more complex patterns, including multiple selects. + // The Select must be used only outside the loop and by the PHI. + for (User *U : I->users()) { + if (U == OrigPhi) + continue; + if (auto *UI = dyn_cast<Instruction>(U); UI && !TheLoop->contains(UI)) + continue; + return InstDesc(false, I); + } + + return InstDesc(I, RecurKind::FindLast); + } + // Returns either FindFirstIV/FindLastIV, if such a pattern is found, or // std::nullopt. 
auto GetRecurKind = [&](Value *V) -> std::optional { @@ -920,8 +943,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( Kind == RecurKind::Add || Kind == RecurKind::Mul || Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); - if (isFindIVRecurrenceKind(Kind) && SE) - return isFindIVPattern(Kind, L, OrigPhi, I, *SE); + if ((isFindIVRecurrenceKind(Kind) || isFindLastRecurrenceKind(Kind)) && SE) + return isFindPattern(Kind, L, OrigPhi, I, *SE); [[fallthrough]]; case Instruction::FCmp: case Instruction::ICmp: @@ -1118,7 +1141,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - + if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n"); + return true; + } // Not a reduction of known type. return false; } @@ -1248,6 +1275,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: return Instruction::FCmp; + case RecurKind::FindLast: + return Instruction::Select; default: llvm_unreachable("Unknown recurrence operation"); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 197aae6e03cb1..b27dcbf7acd79 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5451,6 +5451,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( case RecurKind::FMax: case RecurKind::FMulAdd: case RecurKind::AnyOf: + case RecurKind::FindLast: return true; default: return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b7224a33f47b1..0bbad6f657a24 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1275,6 +1275,7 @@ 
class LoopVectorizationCostModel { "from latch block\n"); return true; } + if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " "interleaved group requires scalar epilogue\n"); @@ -4045,6 +4046,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, continue; case VPDef::VPReductionSC: case VPDef::VPActiveLaneMaskPHISC: + case VPDef::VPLastActiveMaskPHISC: case VPDef::VPWidenCallSC: case VPDef::VPWidenCanonicalIVSC: case VPDef::VPWidenCastSC: @@ -4265,11 +4267,15 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum // reductions need special handling and are currently unsupported. + // FindLast reductions also require special handling for the synthesized + // mask PHI. if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( - Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); + RecurKind Kind = + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) || + RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind); })) return false; @@ -4559,6 +4565,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred); + // FIXME: implement interleaving for FindLast transform correctly. + for (auto &[_, RdxDesc] : Legal->getReductionVars()) + if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) + return 1; + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. 
if (LoopCost == 0) { @@ -8488,6 +8500,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan, Builder)) return nullptr; + // Create whole-vector selects for find-last recurrences. + VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan, + RecipeBuilder, Legal); + if (useActiveLaneMask(Style)) { // TODO: Move checks to VPlanTransforms::addActiveLaneMask once // TailFoldingStyle is visible there. @@ -8581,10 +8597,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; RecurKind Kind = PhiR->getRecurrenceKind(); - assert( - !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && - "AnyOf and FindIV reductions are not allowed for in-loop reductions"); + assert(!RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) && + !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && + "AnyOf, FindIV, and FindLast reductions are not allowed for in-loop " + "reductions"); // Collect the chain of "link" recipes for the reduction starting at PhiR. 
SetVector Worklist; @@ -8884,7 +8901,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( RecurKind RK = RdxDesc.getRecurrenceKind(); if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) && !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) { + !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) && + !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) { VPBuilder PHBuilder(Plan->getVectorPreheader()); VPValue *Iden = Plan->getOrAddLiveIn( getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags())); @@ -9294,7 +9312,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { SmallPtrSet EpiWidenedPhis; for (VPRecipeBase &R : EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - if (isa(&R)) + if (isa(&R)) continue; EpiWidenedPhis.insert( cast(R.getVPSingleValue()->getUnderlyingValue())); @@ -9491,6 +9509,10 @@ static SmallVector preparePlanForEpilogueVectorLoop( continue; } } + } else if (isa(R)) { + // LastActiveMasks are only used as part of FindLast reductions, + // and aren't passed to the scalar loop. + continue; } else { // Retrieve the induction resume values for wide inductions from // their original phi nodes in the scalar loop. @@ -10006,6 +10028,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Override IC if user provided an interleave count. IC = UserIC > 0 ? UserIC : IC; + // FIXME: Enable interleaving for last_active reductions. + if (any_of(make_second_range(LVL.getReductionVars()), [&](auto &RdxDesc) { + return RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind()); + })) { + LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " + << "to conditional scalar assignments.\n"); + IntDiagMsg = { + "ConditionalAssignmentPreventsScalarInterleaving", + "Unable to interleave without vectorization due to conditional " + "assignments"}; + InterleaveLoop = false; + IC = 1; + } + // Emit diagnostic messages, if any. 
const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index df835a077f2a0..c6be736b41509 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -25174,6 +25174,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25315,6 +25316,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25421,6 +25423,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5851b3ab7978c..f0fe89ca9133d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -548,6 +548,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPPredInstPHISC: case VPRecipeBase::VPCanonicalIVPHISC: case VPRecipeBase::VPActiveLaneMaskPHISC: + case VPRecipeBase::VPLastActiveMaskPHISC: case VPRecipeBase::VPFirstOrderRecurrencePHISC: case VPRecipeBase::VPWidenPHISC: case VPRecipeBase::VPWidenIntOrFpInductionSC: @@ -1064,6 +1065,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// Returns the value for vscale. VScale, OpsEnd = VScale, + /// Extracts the last active lane based on a predicate vector operand. 
+ ExtractLastActive, }; /// Returns true if this VPInstruction generates scalar values for all lanes. @@ -3556,6 +3559,40 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { #endif }; +// TODO: Can we unify the PHI recipe hierarchy a bit? VPPredInstPHISC is close +// to this (just a PHI of a predicate), but isn't a header phi so can't +// be used for the mask of FindLastActive reductions. +// +// This is basically a clone of VPActiveLaneMaskPHIRecipe, but won't run into +// problems with transforms that expect there to only be a single ALM PHI, and +// can be ignored by other code looking for a (non-existent) underlying value. +class VPLastActiveMaskPHIRecipe : public VPHeaderPHIRecipe { +public: + VPLastActiveMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPDef::VPLastActiveMaskPHISC, nullptr, StartMask, + DL) {} + + ~VPLastActiveMaskPHIRecipe() override = default; + + VPLastActiveMaskPHIRecipe *clone() override { + auto *R = new VPLastActiveMaskPHIRecipe(getOperand(0), getDebugLoc()); + if (getNumOperands() == 2) + R->addOperand(getOperand(1)); + return R; + } + + VP_CLASSOF_IMPL(VPDef::VPLastActiveMaskPHISC); + + /// Generate the mask phi + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for generating the phi node for the current index of elements, /// adjusted in accordance with EVL value. 
It starts at the start value of the /// canonical induction and gets incremented by EVL in each iteration of the diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..3168d6652f03b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -118,7 +118,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: case VPInstruction::ExtractLastLanePerPart: - case VPInstruction::ExtractPenultimateElement: { + case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ExtractLastActive: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast(BaseTy)) return VecTy->getElementType(); @@ -276,14 +277,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { TypeSwitch(V->getDefiningRecipe()) .Case( - [this](const auto *R) { - // Handle header phi recipes, except VPWidenIntOrFpInduction - // which needs special handling due it being possibly truncated. - // TODO: consider inferring/caching type of siblings, e.g., - // backedge value, here and in cases below. - return inferScalarType(R->getStartValue()); - }) + VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe, + VPLastActiveMaskPHIRecipe>([this](const auto *R) { + // Handle header phi recipes, except VPWidenIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. 
+ return inferScalarType(R->getStartValue()); + }) .Case( [](const auto *R) { return R->getScalarType(); }) .CasegetType(); + + Module *M = State.Builder.GetInsertBlock()->getModule(); + Function *ExtractLast = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_vector_extract_last_active, {VTy}); + return Builder.CreateCall(ExtractLast, {Data, Mask, Default}); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -1141,6 +1154,15 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); } + case VPInstruction::ExtractLastActive: { + Type *ScalarTy = Ctx.Types.inferScalarType(this); + Type *VecTy = toVectorTy(ScalarTy, VF); + Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + IntrinsicCostAttributes ICA( + Intrinsic::experimental_vector_extract_last_active, ScalarTy, + {VecTy, MaskTy, ScalarTy}); + return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind); + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector Mask(VF.getKnownMinValue()); @@ -1197,6 +1219,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::FirstActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || + getOpcode() == VPInstruction::ExtractLastActive || getOpcode() == VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -1260,6 +1283,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExtractPenultimateElement: case VPInstruction::ActiveLaneMask: case VPInstruction::FirstActiveLane: + case VPInstruction::ExtractLastActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1445,6 +1469,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case 
VPInstruction::Unpack: O << "unpack"; break; + case VPInstruction::ExtractLastActive: + O << "extract-last-active"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -4483,6 +4510,27 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPLastActiveMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = + State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); + Value *StartMask = State.get(getOperand(0)); + PHINode *Phi = + State.Builder.CreatePHI(StartMask->getType(), 2, "last.active.mask"); + Phi->addIncoming(StartMask, VectorPH); + State.set(this, Phi); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPLastActiveMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "LAST-ACTIVE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 48bd697397f41..8fbd3686af117 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" using namespace llvm; using namespace VPlanPatternMatch; @@ -4593,3 +4594,80 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, } } } + +void VPlanTransforms::convertFindLastRecurrences( + VPlan &Plan, VPRecipeBuilder &RecipeBuilder, + LoopVectorizationLegality *Legal) { + assert(Legal && "Need valid LoopVecLegality"); + + // May need to do something better than this? 
+ if (Plan.hasScalarVFOnly()) + return; + + // We want to create the following nodes: + // vec.body: + // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] + // ...data.phi already exists, but needs updating... + // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] + // + // ...'data' and 'compare' created by existing nodes... + // + // any_active = i1 any_of_reduction(compare) + // new.mask = select any_active, compare, mask.phi + // new.data = select any_active, data, data.phi + // + // middle.block: + // result = extract-last-active new.data, new.mask, default.val + + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { + if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi); + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + + // Add mask phi + VPValue *False = + Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext())); + auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc()); + Builder.insert(MaskPHI); + + // Find the condition for the select + SelectInst *Select = cast(RdxDesc.getLoopExitInstr()); + auto *SR = cast(RecipeBuilder.getRecipe(Select)); + VPValue *Cond = SR->getCond(); + + // Add select for mask + Builder.setInsertPoint(SR); + VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); + MaskPHI->addOperand(MaskSelect); + + // Replace select for data + VPValue *DataSelect = Builder.createSelect( + AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); + SR->replaceAllUsesWith(DataSelect); + SR->eraseFromParent(); + + // Find final reduction and replace it with an + // extract.last.active intrinsic. 
+ VPInstruction *RdxResult = nullptr; + for (VPUser *U : DataSelect->users()) { + VPInstruction *I = dyn_cast<VPInstruction>(U); + if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) { + RdxResult = I; + break; + } + } + + assert(RdxResult); + Builder.setInsertPoint(RdxResult); + VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn( + RdxDesc.getRecurrenceStartValue()); + auto *ExtractLastActive = Builder.createNaryOp( + VPInstruction::ExtractLastActive, {DataSelect, MaskSelect, Default}, + RdxResult->getDebugLoc()); + RdxResult->replaceAllUsesWith(ExtractLastActive); + RdxResult->eraseFromParent(); + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b28559b620e13..71aa6f9457c1c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,6 +23,7 @@ namespace llvm { class InductionDescriptor; class Instruction; +class LoopVectorizationLegality; class PHINode; class ScalarEvolution; class PredicatedScalarEvolution; @@ -377,6 +378,13 @@ struct VPlanTransforms { /// users in the original exit block using the VPIRInstruction wrapping to the /// LCSSA phi. static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range); + + /// Change FindLast reductions to save the appropriate state using selects + /// for entire vectors for both the latest mask containing at least one active + /// element and the corresponding data vector. 
+ static void convertFindLastRecurrences(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder, + LoopVectorizationLegality *Legal); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index e22c5dfdb9f38..67cf86b16c6fb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -47,7 +47,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { } bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { - if (isa(V)) + if (isa(V)) return true; auto IsWideCanonicalIV = [](VPValue *A) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 5da74630ef626..7a3f4f63a09a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -366,6 +366,7 @@ class VPDef { // VPHeaderPHIRecipe need to be kept together. VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, + VPLastActiveMaskPHISC, VPEVLBasedIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenIntOrFpInductionSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll new file mode 100644 index 0000000000000..25c698f3df245 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -0,0 +1,397 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON +; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE + +;; The following run line caused an ICE before using a dedicated FindLast PHI recipe. +;; We're not looking at the resulting IR, just confirming it doesn't crash. 
+; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null + +target triple = "aarch64-linux-gnu" + +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @simple_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret i32 [[SELECT_DATA]] +; +; SVE-LABEL: define i32 @simple_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]] +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement zeroinitializer, i32 [[A_FR]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze [[WIDE_LOAD]] +; SVE-NEXT: [[TMP7:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]] +; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP7]]) +; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP7]], [[LAST_ACTIVE_MASK]] +; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD_FR]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP10]], [[TMP9]], i32 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = 
getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A_FR]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { +; NEON-LABEL: define ptr @simple_csa_ptr_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; 
NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret ptr [[SELECT_DATA]] +; +; SVE-LABEL: define ptr @simple_csa_ptr_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret ptr [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi ptr [ %init, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds ptr, ptr %data, i64 %iv + %ld = load ptr, ptr %ld.addr, align 4 + %ld.i64 = ptrtoint ptr %ld to i64 + %select.cmp = icmp slt i64 %a, %ld.i64 + %select.data = select i1 %select.cmp, ptr %ld, ptr %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret ptr %select.data +} + +define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { +; NEON-LABEL: define float @simple_csa_float_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) { +; 
NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret float [[SELECT_DATA]] +; +; SVE-LABEL: define float @simple_csa_float_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret float [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi float [ -1.0, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds float, ptr %data, i64 %iv 
+ %ld = load float, ptr %ld.addr, align 4 + %select.cmp = fcmp olt float %a, %ld + %select.data = select i1 %select.cmp, float %ld, float %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret float %select.data +} + +define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) { +; NEON-LABEL: define i32 @multi_user_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]] +; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret i32 [[SELECT_DATA]] +; +; SVE-LABEL: define i32 @multi_user_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; 
SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]] +; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret i32 [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv + store i32 %select.data, ptr %res.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + + +define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; 
NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32 +; NEON-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]] +; NEON-NEXT: ret i32 [[RES]] +; +; SVE-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32 +; SVE-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]] +; SVE-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %idx.phi = phi i64 [ -1, %entry ], [ %select.idx, %loop ] + %ld.addr = getelementptr inbounds i32, 
ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %select.idx = select i1 %select.cmp, i64 %iv, i64 %idx.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + %idx = trunc i64 %select.idx to i32 + %res = add i32 %idx, %select.data + ret i32 %res +} + + +define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i32 %a, i32 %b) { +; NEON-LABEL: define i32 @chained_select_for_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]] +; NEON-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; NEON-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]] +; NEON-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret i32 [[SELECT_DATA]] +; +; SVE-LABEL: define i32 @chained_select_for_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) 
#[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]] +; SVE-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; SVE-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]] +; SVE-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret i32 [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld1.addr = getelementptr inbounds i32, ptr %data1, i64 %iv + %ld1 = load i32, ptr %ld1.addr, align 4 + %select.cmp1 = icmp slt i32 %a, %ld1 + %select.ld1 = select i1 %select.cmp1, i32 %ld1, i32 %data.phi + %ld2.addr = getelementptr inbounds i32, ptr %data2, i64 %iv + %ld2 = load i32, ptr %ld2.addr, align 4 + %select.cmp2 = icmp sgt i32 %b, %ld2 + %select.data = select i1 %select.cmp2, i32 %ld2, i32 %select.ld1 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll 
b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll new file mode 100644 index 0000000000000..e802093fc7886 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -0,0 +1,123 @@ +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \ +; RUN: -disable-output 2>&1 < %s | FileCheck %s + + +; This function is derived from the following C program: +; int simple_csa_int_select(int N, int *data, int a) { +; int t = -1; +; for (int i = 0; i < N; i++) { +; if (a < data[i]) +; t = data[i]; +; } +; return t; +; } +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + + +; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in ir<%N> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> +; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir, vp<%8> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE 
ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr> +; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6> +; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> +; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp> +; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> +; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv +; CHECK-NEXT: IR %ld = load i32, ptr %ld.addr, align 4 +; CHECK-NEXT: IR %select.cmp = icmp slt i32 %a, %ld +; CHECK-NEXT: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi +; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: IR %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: No 
successors +; CHECK-NEXT: } + +; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir, vp<%8> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%ld> = load vp<%6> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%7> = any-of ir<%select.cmp> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; 
CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld = load i32, ptr %ld.addr, align 4 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.cmp = icmp slt i32 %a, %ld +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block) diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 70b1ea13677b8..98fe48f4509f0 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC1VF4 %s -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF4 %s -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF1 %s +; RUN: opt 
-passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC1VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC4VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefix=IC4VF1 %s define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start( @@ -961,29 +961,142 @@ exit: ; preds = %loop } ; The unsigned sentinel value for decreasing-IV vectorization is ULONG_MAX, -; and since the IV hits this value, it is impossible to vectorize this case. +; and since the IV hits this value, it cannot be vectorized as a FindLastIV +; reduction. Instead, it is recognized and vectorized as a generic FindLast. ; In this test, %iv's range will include both signed and unsigned ; maximum (sentinel) values. -define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 
[[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*:]] +; IC1VF4-NEXT: br label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr 
inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC1VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC1VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC1VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC1VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC1VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC1VF4-NEXT: br label %[[SCALAR_PH:.*]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, 
ptr [[GEP_B_IV]], align 1 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*:]] +; IC4VF4-NEXT: br label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC4VF4-NEXT: [[REVERSE:%.*]] = 
shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC4VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC4VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC4VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC4VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC4VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC4VF4-NEXT: br label %[[SCALAR_PH:.*]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF4-NEXT: 
[[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop @@ -1005,26 +1118,164 @@ exit: ret i64 %cond } -define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 
@not_vectorized_select_decreasing_induction_icmp_non_const_start( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1VF4-NEXT: [[TMP2:%.*]] = 
sub i64 [[N]], [[N_VEC]] +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr 
[[TMP10]], align 8 +; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC1VF4-NEXT: 
[[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]] +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; 
IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 
@select_decreasing_induction_icmp_non_const_start( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index c958ea7b9b88e..6da4869e1fe78 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -145,10 +145,44 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_nuw( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: 
[[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 
@llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -157,9 +191,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -186,10 +220,44 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_noflag( ; CHECK-SAME: ptr 
[[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) 
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -198,9 +266,9 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ 
[[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -229,4 +297,8 @@ exit: ; preds = %for.body ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll index 72ed6537ef640..7a89c32b197d3 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll @@ -1,32 +1,156 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 define i64 @select_non_const_iv_start_signed_guard(ptr 
%a, i64 %rdx_start, i64 %iv_start ,i64 %n) { -; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; 
CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: 
[[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add 
nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; 
CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br 
i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i64 %iv_start, %n @@ -49,32 +173,162 @@ exit: } define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, i32 %iv_start ,i32 %n) { -; CHECK-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 
-; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; 
CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select 
i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 
[[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 
x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; 
CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: 
[[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i32 %iv_start, %n @@ -101,3 +355,18 @@ exit: %idx.0.lcssa = phi i32 [ %rdx_start, %entry ], [ %cond, %for.body ] ret i32 %idx.0.lcssa } +;. 
+; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index b991d58eb2b8d..3c6740a922b69 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -674,65 +674,125 @@ exit: ; preds = %for.body ; Negative tests -; This test can theoretically be vectorized, but only with a runtime-check. -; The construct that are introduced by IndVarSimplify is: +; This test can theoretically be vectorized as a FindLastIV reduction, but only +; with a runtime-check. It will vectorize as a generic FindLast reduction. +; +; For FindLastIV, the construct that is introduced by IndVarSimplify is: ; %1 = trunc i64 %iv to i32 ; However, the loop guard is an i64: ; %cmp.sgt = icmp sgt i64 %n, 0 ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. 
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { -; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x 
i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 
[[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, 
%[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 3 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP9]], i32 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; 
CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 @@ -778,67 +838,127 @@ exit: ; preds = %for.body, %entry ret i32 %rdx.lcssa } -; This test can theoretically be vectorized, but only with a runtime-check. -; The construct that are introduced by IndVarSimplify is: +; This test can theoretically be vectorized as a FindLastIV reduction, but only +; with a runtime-check. It will vectorize as a generic FindLast reduction. +; +; For FindLastIV, the construct that is introduced by IndVarSimplify is: ; %1 = trunc i64 %iv to i32 ; However, the loop guard is unsigned: ; %cmp.not = icmp eq i32 %n, 0 ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. 
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { -; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: 
[[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, 
ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; 
CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 
[[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP8]], 3 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP9]], i32 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], 
%[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 @@ -899,41 +1019,61 @@ exit: ; preds = %for.body, %entry define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: -; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 -2, i32 -1, i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: 
[[IV:%.*]] = add i64 4294967294, [[INDEX]] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-NEXT: ret i32 [[TMP7]] ; ; CHECK-VF4IC4-LABEL: define i32 
@not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: -; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 -2, i32 -1, i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> 
[[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC4-NEXT: ret i32 [[TMP7]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { @@ -980,44 +1120,112 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; 
CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] 
= call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 
1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] 
] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( @@ -1071,38 +1279,84 @@ exit: ; preds = %for.body define i32 
@not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x 
i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-VF4IC1: [[SCALAR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; 
CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-VF4IC4: [[SCALAR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] @@ -1156,22 +1410,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
%[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; 
CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load 
i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -1184,22 +1472,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> 
[[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x 
i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; 
CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index 91c83103bf8f2..c6b0572973302 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1948,16 +1948,52 @@ exit: ; preds = %for.body } ; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since -; the IV hits this value, it is impossible to vectorize this case. +; the IV hits this value, it is vectorized as a generic last-active reduction. 
define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] 
= freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: 
[[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1967,19 +2003,55 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector 
<4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 -9223372036854775808, i64 -9223372036854775807, i64 -9223372036854775806, i64 -9223372036854775805>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 
@llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1989,9 +2061,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. 
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( @@ -2042,10 +2114,50 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> 
[[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3> +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; 
CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2054,18 +2166,58 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = 
phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3> +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 
x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: 
br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2074,9 +2226,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index e4922d3e4f627..8b62693de87fc 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -1128,27 +1128,124 @@ exit: ; preds = %loop ret float %sel } -; We don't support selecting loop-variant values. 
define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) { -; CHECK-LABEL: define i32 @select_variant_i32_from_icmp( -; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 -; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 -; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 -; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[SEL_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC1-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), 
%[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; 
CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD9]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP27]] = select i1 [[TMP19]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC4: [[LOOP]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr 
[[V1]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF1IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC4: [[LOOP]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label 
%[[LOOP]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]] ; entry: br label %loop @@ -1220,6 +1317,8 @@ exit: ; preds = %loop ; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1235,6 +1334,8 @@ exit: ; preds = %loop ; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}