From be7152fe42ca8ccc9a33ae86353a348a84ce7bce Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 23 Aug 2024 14:37:20 +0100 Subject: [PATCH 1/2] LV: fix style after cursory reading (NFC) --- .../Transforms/Vectorize/LoopVectorize.cpp | 305 +++++++++--------- 1 file changed, 151 insertions(+), 154 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0200525a718d5..edd18ab4d8c87 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -71,7 +71,6 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -104,7 +103,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" @@ -126,12 +124,9 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" -#include "llvm/IR/ValueHandle.h" -#include "llvm/IR/VectorBuilder.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/InstructionCost.h" @@ -148,12 +143,10 @@ #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include #include -#include #include #include #include #include -#include #include #include #include @@ -361,7 +354,7 @@ cl::opt EnableVPlanNativePath( "enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization.")); -} +} // namespace llvm // This flag enables the stress testing of the VPlan H-CFG construction in the // VPlan-native vectorization path. It must be used in conjuction with @@ -1190,8 +1183,8 @@ class LoopVectorizationCostModel { assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. - for (unsigned i = 0; i < Grp->getFactor(); ++i) { - if (auto *I = Grp->getMember(i)) { + for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) { + if (auto *I = Grp->getMember(Idx)) { if (Grp->getInsertPos() == I) WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); else @@ -1834,7 +1827,7 @@ class GeneratedRTChecks { /// un-linked from the IR and is added back during vector code generation. If /// there is no vector code generation, the check blocks are removed /// completely. - void Create(Loop *L, const LoopAccessInfo &LAI, + void create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { // Hard cutoff to limit compile-time increase in case a very large number of @@ -2610,7 +2603,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( IRBuilder<> B(LoopVectorPreHeader->getTerminator()); // Fast-math-flags propagate from the original induction instruction. 
-    if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+    if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
 
     EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
@@ -2791,7 +2784,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
       IRBuilder<> B(MiddleBlock->getTerminator());
 
       // Fast-math-flags propagate from the original induction instruction.
-      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
+      if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
 
       Value *CountMinusOne = B.CreateSub(
@@ -2903,7 +2896,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   return ScalarCallCost;
 }
 
-static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
     return Elt;
   return VectorType::get(Elt, VF);
 }
@@ -2914,7 +2907,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                    ElementCount VF) const {
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
-  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
+  Type *RetTy = maybeVectorizeType(CI->getType(), VF);
   FastMathFlags FMF;
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
@@ -2924,7 +2917,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
   SmallVector<Type *> ParamTys;
   std::transform(FTy->param_begin(), FTy->param_end(),
                  std::back_inserter(ParamTys),
-                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
+                 [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
 
   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                     dyn_cast<IntrinsicInst>(CI));
@@ -3016,7 +3009,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
 
   // Returns true if a given use occurs in the predicated block. Phi nodes use
   // their operands in their corresponding predecessor blocks.
-  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+  auto IsBlockOfUsePredicated = [&](Use &U) -> bool {
     auto *I = cast<Instruction>(U.getUser());
     BasicBlock *BB = I->getParent();
     if (auto *Phi = dyn_cast<PHINode>(I))
@@ -3060,7 +3053,7 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
       // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
-      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
+      if (!llvm::all_of(I->uses(), IsBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }
@@ -3088,9 +3081,9 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
     // Make sure the builder has a valid insert point.
     Builder.SetInsertPoint(NewPhi);
-    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
-      VPValue *Inc = VPPhi->getIncomingValue(i);
-      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
+    for (unsigned Idx = 0; Idx < VPPhi->getNumOperands(); ++Idx) {
+      VPValue *Inc = VPPhi->getIncomingValue(Idx);
+      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
     }
   }
@@ -3124,7 +3117,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
   // The pointer operands of loads and stores will be scalar as long as the
   // memory access is not a gather or scatter operation. The value operand of a
   // store will remain scalar if the store is scalarized.
- auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { + auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) { InstWidening WideningDecision = getWideningDecision(MemAccess, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -3138,7 +3131,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // A helper that returns true if the given value is a getelementptr // instruction contained in the loop. - auto isLoopVaryingGEP = [&](Value *V) { + auto IsLoopVaryingGEP = [&](Value *V) { return isa(V) && !TheLoop->isLoopInvariant(V); }; @@ -3146,10 +3139,10 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // be a scalar use and the pointer is only used by memory accesses, we place // the pointer in ScalarPtrs. Otherwise, the pointer is placed in // PossibleNonScalarPtrs. - auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { + auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { // We only care about bitcast and getelementptr instructions contained in // the loop. - if (!isLoopVaryingGEP(Ptr)) + if (!IsLoopVaryingGEP(Ptr)) return; // If the pointer has already been identified as scalar (e.g., if it was @@ -3161,7 +3154,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // If the use of the pointer will be a scalar use, and all users of the // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, // place the pointer in PossibleNonScalarPtrs. - if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { + if (IsScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) { return isa(U) || isa(U); })) ScalarPtrs.insert(I); @@ -3186,10 +3179,10 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (auto *Load = dyn_cast(&I)) { - evaluatePtrUse(Load, Load->getPointerOperand()); + EvaluatePtrUse(Load, Load->getPointerOperand()); } else if (auto *Store = dyn_cast(&I)) { - evaluatePtrUse(Store, Store->getPointerOperand()); - evaluatePtrUse(Store, Store->getValueOperand()); + EvaluatePtrUse(Store, Store->getPointerOperand()); + EvaluatePtrUse(Store, Store->getValueOperand()); } } for (auto *I : ScalarPtrs) @@ -3215,14 +3208,14 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { unsigned Idx = 0; while (Idx != Worklist.size()) { Instruction *Dst = Worklist[Idx++]; - if (!isLoopVaryingGEP(Dst->getOperand(0))) + if (!IsLoopVaryingGEP(Dst->getOperand(0))) continue; auto *Src = cast(Dst->getOperand(0)); if (llvm::all_of(Src->users(), [&](User *U) -> bool { auto *J = cast(U); return !TheLoop->contains(J) || Worklist.count(J) || ((isa(J) || isa(J)) && - isScalarUse(J, Src)); + IsScalarUse(J, Src)); })) { Worklist.insert(Src); LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n"); @@ -3247,7 +3240,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { return Induction.second.getKind() == InductionDescriptor::IK_PtrInduction && (isa(I) || isa(I)) && - Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar); + Indvar == getLoadStorePointerOperand(I) && IsScalarUse(I, Indvar); }; // Determine if all users of the induction variable are scalar after @@ -3466,21 +3459,20 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // losslessly cast all values to a common type. 
unsigned InterleaveFactor = Group->getFactor(); bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy); - for (unsigned i = 0; i < InterleaveFactor; i++) { - Instruction *Member = Group->getMember(i); + for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) { + Instruction *Member = Group->getMember(Idx); if (!Member) continue; auto *MemberTy = getLoadStoreType(Member); bool MemberNI = DL.isNonIntegralPointerType(MemberTy); // Don't coerce non-integral pointers to integers or vice versa. - if (MemberNI != ScalarNI) { + if (MemberNI != ScalarNI) // TODO: Consider adding special nullptr value case here return false; - } else if (MemberNI && ScalarNI && - ScalarTy->getPointerAddressSpace() != - MemberTy->getPointerAddressSpace()) { + if (MemberNI && ScalarNI && + ScalarTy->getPointerAddressSpace() != + MemberTy->getPointerAddressSpace()) return false; - } } // Check if masking is required. @@ -3561,7 +3553,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // Global values, params and instructions outside of current loop are out of // scope. - auto isOutOfScope = [&](Value *V) -> bool { + auto IsOutOfScope = [&](Value *V) -> bool { Instruction *I = dyn_cast(V); return (!I || !TheLoop->contains(I)); }; @@ -3573,8 +3565,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // that require predication must not be considered uniform after // vectorization, because that would create an erroneous replicating region // where only a single instance out of VF should be formed. - auto addToWorklistIfAllowed = [&](Instruction *I) -> void { - if (isOutOfScope(I)) { + auto AddToWorklistIfAllowed = [&](Instruction *I) -> void { + if (IsOutOfScope(I)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " << *I << "\n"); return; @@ -3597,13 +3589,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { for (BasicBlock *E : Exiting) { auto *Cmp = dyn_cast(E->getTerminator()->getOperand(0)); if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) - addToWorklistIfAllowed(Cmp); + AddToWorklistIfAllowed(Cmp); } auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can // thus chose to execute only one. - auto isUniformMemOpUse = [&](Instruction *I) { + auto IsUniformMemOpUse = [&](Instruction *I) { // If the value was already known to not be uniform for the previous // (smaller VF), it cannot be uniform for the larger VF. if (PrevVF.isVector()) { @@ -3621,12 +3613,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { return TheLoop->isLoopInvariant(cast(I)->getValueOperand()); }; - auto isUniformDecision = [&](Instruction *I, ElementCount VF) { + auto IsUniformDecision = [&](Instruction *I, ElementCount VF) { InstWidening WideningDecision = getWideningDecision(I, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); - if (isUniformMemOpUse(I)) + if (IsUniformMemOpUse(I)) return true; return (WideningDecision == CM_Widen || @@ -3637,11 +3629,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // Returns true if Ptr is the pointer operand of a memory access instruction // I, I is known to not require scalarization, and the pointer is not also // stored. 
- auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { + auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { if (isa(I) && I->getOperand(0) == Ptr) return false; return getLoadStorePointerOperand(I) == Ptr && - (isUniformDecision(I, VF) || Legal->isInvariant(Ptr)); + (IsUniformDecision(I, VF) || Legal->isInvariant(Ptr)); }; // Holds a list of values which are known to have at least one uniform use. @@ -3663,7 +3655,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: if (TheLoop->hasLoopInvariantOperands(&I)) - addToWorklistIfAllowed(&I); + AddToWorklistIfAllowed(&I); break; default: break; @@ -3673,9 +3665,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // ExtractValue instructions must be uniform, because the operands are // known to be loop-invariant. if (auto *EVI = dyn_cast(&I)) { - assert(isOutOfScope(EVI->getAggregateOperand()) && + assert(IsOutOfScope(EVI->getAggregateOperand()) && "Expected aggregate value to be loop invariant"); - addToWorklistIfAllowed(EVI); + AddToWorklistIfAllowed(EVI); continue; } @@ -3684,10 +3676,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (!Ptr) continue; - if (isUniformMemOpUse(&I)) - addToWorklistIfAllowed(&I); + if (IsUniformMemOpUse(&I)) + AddToWorklistIfAllowed(&I); - if (isVectorizedMemAccessUse(&I, Ptr)) + if (IsVectorizedMemAccessUse(&I, Ptr)) HasUniformUse.insert(Ptr); } @@ -3695,28 +3687,28 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // demanding) users. Since loops are assumed to be in LCSSA form, this // disallows uses outside the loop as well. for (auto *V : HasUniformUse) { - if (isOutOfScope(V)) + if (IsOutOfScope(V)) continue; auto *I = cast(V); auto UsersAreMemAccesses = llvm::all_of(I->users(), [&](User *U) -> bool { auto *UI = cast(U); - return TheLoop->contains(UI) && isVectorizedMemAccessUse(UI, V); + return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V); }); if (UsersAreMemAccesses) - addToWorklistIfAllowed(I); + AddToWorklistIfAllowed(I); } // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures // a uniform instruction will only be used by uniform instructions. - unsigned idx = 0; - while (idx != Worklist.size()) { - Instruction *I = Worklist[idx++]; + unsigned Idx = 0; + while (Idx != Worklist.size()) { + Instruction *I = Worklist[Idx++]; for (auto *OV : I->operand_values()) { // isOutOfScope operands cannot be uniform instructions. - if (isOutOfScope(OV)) + if (IsOutOfScope(OV)) continue; // First order recurrence Phi's should typically be considered // non-uniform. 
@@ -3728,9 +3720,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto *OI = cast(OV); if (llvm::all_of(OI->users(), [&](User *U) -> bool { auto *J = cast(U); - return Worklist.count(J) || isVectorizedMemAccessUse(J, OI); + return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI); })) - addToWorklistIfAllowed(OI); + AddToWorklistIfAllowed(OI); } } @@ -3750,7 +3742,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool { auto *I = cast(U); return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) || - isVectorizedMemAccessUse(I, Ind); + IsVectorizedMemAccessUse(I, Ind); }); if (!UniformInd) continue; @@ -3761,14 +3753,14 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { llvm::all_of(IndUpdate->users(), [&](User *U) -> bool { auto *I = cast(U); return I == Ind || !TheLoop->contains(I) || Worklist.count(I) || - isVectorizedMemAccessUse(I, IndUpdate); + IsVectorizedMemAccessUse(I, IndUpdate); }); if (!UniformIndUpdate) continue; // The induction variable and its update instruction will remain uniform. - addToWorklistIfAllowed(Ind); - addToWorklistIfAllowed(IndUpdate); + AddToWorklistIfAllowed(Ind); + AddToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); @@ -3918,8 +3910,8 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( if (UserVF.isScalable()) return FixedScalableVFPair( ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); - else - return UserVF; + + return UserVF; } assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); @@ -4706,11 +4698,10 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; - else { - LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " - "viable.\n"); - return Result; - } + + LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " + "viable.\n"); + return Result; } if (OrigLoop->getHeader()->getParent()->hasOptSize() || @@ -4900,8 +4891,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - for (auto& pair : R.MaxLocalUsers) { - pair.second = std::max(pair.second, 1U); + for (auto &Pair : R.MaxLocalUsers) { + Pair.second = std::max(Pair.second, 1U); } // We calculate the interleave count using the following formula. @@ -4917,11 +4908,12 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // this currently happens when OptForSize, in which case IC is set to 1 above. 
unsigned IC = UINT_MAX; - for (auto& pair : R.MaxLocalUsers) { - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + for (const auto &Pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(Pair.first); LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " - << TTI.getRegisterClassName(pair.first) << " register class\n"); + << TTI.getRegisterClassName(Pair.first) + << " register class\n"); if (VF.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; @@ -4929,10 +4921,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumVectorRegs; } - unsigned MaxLocalUsers = pair.second; + unsigned MaxLocalUsers = Pair.second; unsigned LoopInvariantRegs = 0; - if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) - LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[Pair.first]; unsigned TmpIC = llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); @@ -5117,10 +5109,10 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // Interleave no less than SmallIC but not as aggressive as the normal IC // to satisfy the rare situation when resources are too limited. return std::max(IC / 2, SmallIC); - } else { - LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); - return SmallIC; } + + LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; } // Interleave if this is a large loop (small loops are already dealt with by @@ -5223,11 +5215,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); }; - for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { - Instruction *I = IdxToInstr[i]; + for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) { + Instruction *I = IdxToInstr[Idx]; // Remove all of the instructions that end at this location. - InstrList &List = TransposeEnds[i]; + InstrList &List = TransposeEnds[Idx]; for (Instruction *ToRemove : List) OpenIntervals.erase(ToRemove); @@ -5242,7 +5234,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { collectInLoopReductions(); // For each VF find the maximum usage of registers. - for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + for (unsigned J = 0, E = VFs.size(); J < E; ++J) { // Count the number of registers used, per register class, given all open // intervals. // Note that elements in this SmallMapVector will be default constructed @@ -5250,7 +5242,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // there is no previous entry for ClassID. SmallMapVector RegUsage; - if (VFs[j].isScalar()) { + if (VFs[J].isScalar()) { for (auto *Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); @@ -5259,12 +5251,12 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { RegUsage[ClassID] += 1; } } else { - collectUniformsAndScalars(VFs[j]); + collectUniformsAndScalars(VFs[J]); for (auto *Inst : OpenIntervals) { // Skip ignored values for VF > 1. 
if (VecValuesToIgnore.count(Inst)) continue; - if (isScalarAfterVectorization(Inst, VFs[j])) { + if (isScalarAfterVectorization(Inst, VFs[J])) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); // FIXME: The target might use more than one register for the type @@ -5273,25 +5265,25 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { } else { unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]); } } } - for (auto& pair : RegUsage) { - auto &Entry = MaxUsages[j][pair.first]; - Entry = std::max(Entry, pair.second); + for (const auto &Pair : RegUsage) { + auto &Entry = MaxUsages[J][Pair.first]; + Entry = std::max(Entry, Pair.second); } } - LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } - for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { // Note that elements in this SmallMapVector will be default constructed // as 0. So we can use "Invariant[ClassID] += n" in the code below even if // there is no previous entry for ClassID. @@ -5303,20 +5295,20 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { bool IsScalar = all_of(Inst->users(), [&](User *U) { auto *I = cast(U); return TheLoop != LI->getLoopFor(I->getParent()) || - isScalarAfterVectorization(I, VFs[i]); + isScalarAfterVectorization(I, VFs[Idx]); }); - ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx]; unsigned ClassID = TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ - dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; - dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() << " item\n"; - for (const auto &pair : MaxUsages[i]) { + for (const auto &pair : MaxUsages[Idx]) { dbgs() << "LV(REG): RegisterClass: " << TTI.getRegisterClassName(pair.first) << ", " << pair.second << " registers\n"; @@ -5331,8 +5323,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { }); RU.LoopInvariantRegs = Invariant; - RU.MaxLocalUsers = MaxUsages[i]; - RUs[i] = RU; + RU.MaxLocalUsers = MaxUsages[Idx]; + RUs[Idx] = RU; } return RUs; @@ -5423,7 +5415,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( SmallVector Worklist; // Returns true if the given instruction can be scalarized. - auto canBeScalarized = [&](Instruction *I) -> bool { + auto CanBeScalarized = [&](Instruction *I) -> bool { // We only attempt to scalarize instructions forming a single-use chain // from the original predicated block that would otherwise be vectorized. 
// Although not strictly necessary, we give up on instructions we know will @@ -5499,7 +5491,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( if (auto *J = dyn_cast(U.get())) { assert(VectorType::isValidElementType(J->getType()) && "Instruction has non-scalar type"); - if (canBeScalarized(J)) + if (CanBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { ScalarCost += TTI.getScalarizationOverhead( @@ -5579,10 +5571,10 @@ static const SCEV *getAddressAccessSCEV( // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. - auto SE = PSE.getSE(); + auto *SE = PSE.getSE(); unsigned NumOperands = Gep->getNumOperands(); - for (unsigned i = 1; i < NumOperands; ++i) { - Value *Opd = Gep->getOperand(i); + for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { + Value *Opd = Gep->getOperand(Idx); if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) && !Legal->isInductionVariable(Opd)) return nullptr; @@ -5601,7 +5593,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, return InstructionCost::getInvalid(); Type *ValTy = getLoadStoreType(I); - auto SE = PSE.getSE(); + auto *SE = PSE.getSE(); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); @@ -5636,10 +5628,10 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch - auto *Vec_i1Ty = + auto *Veci1Ty = VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); Cost += TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()), + Veci1Ty, APInt::getAllOnes(VF.getKnownMinValue()), /*Insert=*/false, /*Extract=*/true, CostKind); Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); @@ -5700,11 +5692,11 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, } StoreInst *SI = cast(I); - bool isLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); + bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue + (IsLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, CostKind, VF.getKnownMinValue() - 1)); @@ -5732,7 +5724,7 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto Group = getInterleavedAccessGroup(I); + const auto *Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); @@ -5999,7 +5991,7 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( // any overhead. SmallVector Tys; for (auto *V : filterExtractingOperands(Ops, VF)) - Tys.push_back(MaybeVectorizeType(V->getType(), VF)); + Tys.push_back(maybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( filterExtractingOperands(Ops, VF), Tys, CostKind); } @@ -6023,7 +6015,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { NumPredStores++; if (Legal->isUniformMemOp(I, VF)) { - auto isLegalToScalarize = [&]() { + auto IsLegalToScalarize = [&]() { if (!VF.isScalable()) // Scalarization of fixed length vectors "just works". 
return true; @@ -6054,8 +6046,9 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract // FIXME: This cost is a significant under-estimate for tail folded // memory ops. - const InstructionCost ScalarizationCost = isLegalToScalarize() ? - getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid(); + const InstructionCost ScalarizationCost = + IsLegalToScalarize() ? getUniformMemOpCost(&I, VF) + : InstructionCost::getInvalid(); // Choose better solution for the current VF, Note that Invalid // costs compare as maximumal large. If both are invalid, we get @@ -6084,7 +6077,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { InstructionCost InterleaveCost = InstructionCost::getInvalid(); unsigned NumAccesses = 1; if (isAccessInterleaved(&I)) { - auto Group = getInterleavedAccessGroup(&I); + const auto *Group = getInterleavedAccessGroup(&I); assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. @@ -6122,7 +6115,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // If the instructions belongs to an interleave group, the whole group // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. - if (auto Group = getInterleavedAccessGroup(&I)) + if (const auto *Group = getInterleavedAccessGroup(&I)) setWideningDecision(Group, VF, Decision, Cost); else setWideningDecision(&I, VF, Decision, Cost); @@ -6173,7 +6166,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { I, VF, CM_Scalarize, (VF.getKnownMinValue() * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); - else if (auto Group = getInterleavedAccessGroup(I)) { + else if (const auto *Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) @@ -6385,10 +6378,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - auto SE = PSE.getSE(); + auto *SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto hasSingleCopyAfterVectorization = [this](Instruction *I, + auto HasSingleCopyAfterVectorization = [this](Instruction *I, ElementCount VF) -> bool { if (VF.isScalar()) return true; @@ -6402,7 +6395,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, return !Scalarized->second.count(UI); }); }; - (void) hasSingleCopyAfterVectorization; + (void)HasSingleCopyAfterVectorization; Type *VectorTy; if (isScalarAfterVectorization(I, VF)) { @@ -6415,7 +6408,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, I->getOpcode() == Instruction::PHI || (I->getOpcode() == Instruction::BitCast && I->getType()->isPointerTy()) || - hasSingleCopyAfterVectorization(I, VF)); + HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else VectorTy = ToVectorTy(RetTy, VF); @@ -6452,19 +6445,21 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (VF.isScalable()) return InstructionCost::getInvalid(); // Return cost for branches around scalarized and predicated blocks. 
- auto *Vec_i1Ty = + auto *Veci1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return ( TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), + Veci1Ty, APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, /*Extract*/ true, CostKind) + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue())); - } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) + } + + if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); - else - // This branch will be eliminated by if-conversion. - return 0; + + // This branch will be eliminated by if-conversion. + return 0; // Note: We currently assume zero cost for an unconditional branch inside // a predicated block since it will become a fall-through, although we // may decide in the future to call TTI for all branches. @@ -7057,9 +7052,9 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); return; - } else - reportVectorizationInfo("UserVF ignored because of invalid costs.", - "InvalidCost", ORE, OrigLoop); + } + reportVectorizationInfo("UserVF ignored because of invalid costs.", + "InvalidCost", ORE, OrigLoop); } } @@ -7397,7 +7392,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { return BestFactor; } -static void AddRuntimeUnrollDisableMetaData(Loop *L) { +static void addRuntimeUnrollDisableMetaData(Loop *L) { SmallVector MDs; // Reserve first location for self reference to the LoopID metadata node. MDs.push_back(nullptr); @@ -7405,14 +7400,14 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { MDNode *LoopID = L->getLoopID(); if (LoopID) { // First find existing loop unrolling disable metadata. - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - auto *MD = dyn_cast(LoopID->getOperand(i)); + for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { + auto *MD = dyn_cast(LoopID->getOperand(I)); if (MD) { const auto *S = dyn_cast(MD->getOperand(0)); IsUnrollMetadata = S && S->getString().starts_with("llvm.loop.unroll.disable"); } - MDs.push_back(LoopID->getOperand(i)); + MDs.push_back(LoopID->getOperand(I)); } } @@ -7618,7 +7613,7 @@ LoopVectorizationPlanner::executePlan( TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) - AddRuntimeUnrollDisableMetaData(L); + addRuntimeUnrollDisableMetaData(L); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. 
@@ -8120,7 +8115,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, assert((isa(I) || isa(I)) && "Must be called with either a load or store"); - auto willWiden = [&](ElementCount VF) -> bool { + auto WillWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && @@ -8133,7 +8128,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + if (!LoopVectorizationPlanner::getDecisionAndClampRange(WillWiden, Range)) return nullptr; VPValue *Mask = nullptr; @@ -8220,7 +8215,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( // Determine whether \p K is a truncation based on an induction variable that // can be optimized. - auto isOptimizableIVTruncate = + auto IsOptimizableIVTruncate = [&](Instruction *K) -> std::function { return [=](ElementCount VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); @@ -8228,7 +8223,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( }; if (LoopVectorizationPlanner::getDecisionAndClampRange( - isOptimizableIVTruncate(I), Range)) { + IsOptimizableIVTruncate(I), Range)) { auto *Phi = cast(I->getOperand(0)); const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); @@ -8510,7 +8505,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, // First, check for specific widening recipes that deal with inductions, Phi // nodes, calls and memory operations. VPRecipeBase *Recipe; - if (auto Phi = dyn_cast(Instr)) { + if (auto *Phi = dyn_cast(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands); @@ -8560,7 +8555,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (!shouldWiden(Instr, Range)) return nullptr; - if (auto GEP = dyn_cast(Instr)) + if (auto *GEP = dyn_cast(Instr)) return new VPWidenGEPRecipe(GEP, make_range(Operands.begin(), Operands.end())); @@ -8634,7 +8629,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, static MapVector collectUsersInExitBlock( Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan, const MapVector &Inductions) { - auto MiddleVPBB = + auto *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); // No edge from the middle block to the unique exit block has been inserted // and there is nothing to fix from vector loop; phis should have incoming @@ -8675,7 +8670,7 @@ addUsersInExitBlock(VPlan &Plan, if (ExitingValuesToFix.empty()) return; - auto MiddleVPBB = + auto *MiddleVPBB = cast(Plan.getVectorLoopRegion()->getSingleSuccessor()); BasicBlock *ExitBB = cast(MiddleVPBB->getSuccessors()[0])->getIRBasicBlock(); @@ -8915,7 +8910,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
-    auto applyIG = [IG, this](ElementCount VF) -> bool {
+    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
       bool Result = (VF.isVector() && // Query is illegal for VF == 1
                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
                          LoopVectorizationCostModel::CM_Interleave);
@@ -8926,7 +8921,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
-    if (!getDecisionAndClampRange(applyIG, Range))
+    if (!getDecisionAndClampRange(ApplyIG, Range))
       continue;
     InterleaveGroups.insert(IG);
   };
@@ -9973,7 +9968,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Optimistically generate runtime checks if they are needed. Drop them if
   // they turn out to not be profitable.
   if (VF.Width.isVector() || SelectedIC > 1)
-    Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+    Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
 
   // Check if it is profitable to vectorize with runtime checks.
   bool ForceVectorization =
@@ -10055,7 +10050,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
              << IntDiagMsg.second;
    });
    return false;
-  } else if (!VectorizeLoop && InterleaveLoop) {
+  }
+
+  if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
@@ -10230,7 +10227,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
       L->setLoopID(*RemainderLoopID);
     } else {
       if (DisableRuntimeUnroll)
-        AddRuntimeUnrollDisableMetaData(L);
+        addRuntimeUnrollDisableMetaData(L);
 
       // Mark the loop as already vectorized to avoid vectorizing again.
       Hints.setAlreadyVectorized();

From 0aa4f704e0e85d8bcb1e71a826dfa5926d014080 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Fri, 6 Sep 2024 16:11:21 +0100
Subject: [PATCH 2/2] LV: fix nit VecI1Ty

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index edd18ab4d8c87..2a4932a944c87 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5628,10 +5628,10 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
     Cost /= getReciprocalPredBlockProb();
 
     // Add the cost of an i1 extract and a branch
-    auto *Veci1Ty =
+    auto *VecI1Ty =
         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
     Cost += TTI.getScalarizationOverhead(
-        Veci1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+        VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
         /*Insert=*/false, /*Extract=*/true, CostKind);
     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
@@ -6445,11 +6445,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     if (VF.isScalable())
       return InstructionCost::getInvalid();
     // Return cost for branches around scalarized and predicated blocks.
-    auto *Veci1Ty =
+    auto *VecI1Ty =
        VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
     return (
         TTI.getScalarizationOverhead(
-            Veci1Ty, APInt::getAllOnes(VF.getFixedValue()),
+            VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
             /*Insert*/ false, /*Extract*/ true, CostKind) +
         (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
   }