@@ -1194,6 +1194,8 @@ using InstructionVFPair = std::pair<Instruction *, ElementCount>;
11941194// / TargetTransformInfo to query the different backends for the cost of
11951195// / different operations.
11961196class LoopVectorizationCostModel {
1197+ friend class LoopVectorizationPlanner ;
1198+
11971199public:
11981200 LoopVectorizationCostModel (ScalarEpilogueLowering SEL, Loop *L,
11991201 PredicatedScalarEvolution &PSE, LoopInfo *LI,
@@ -5352,7 +5354,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
53525354 ? Candidate.Width .getKnownMinValue () * AssumedMinimumVscale
53535355 : Candidate.Width .getFixedValue ();
53545356 LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << i
5355- << " costs: " << ( Candidate.Cost / Width) );
5357+ << " costs: " << Candidate.Cost / Width);
53565358 if (i.isScalable ())
53575359 LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
53585360 << AssumedMinimumVscale << " )" );
@@ -7623,6 +7625,108 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
76237625 return VF;
76247626}
76257627
7628+ InstructionCost LoopVectorizationPlanner::computeCost (VPlan &Plan,
7629+ ElementCount VF) {
7630+ InstructionCost Cost = 0 ;
7631+
7632+ VPBasicBlock *Header =
7633+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getEntry ());
7634+
7635+ // Cost modeling for inductions is inaccurate in the legacy cost model. Try as
7636+ // to match it here initially during VPlan cost model bring up:
7637+ // * VPWidenIntOrFpInductionRecipes implement computeCost,
7638+ // * VPWidenPointerInductionRecipe costs seem to be 0 in the legacy cost model
7639+ // * other inductions only have a cost of 1 (i.e. the cost of the scalar
7640+ // induction increment).
7641+ unsigned NumWideIVs = count_if (Header->phis (), [](VPRecipeBase &R) {
7642+ return isa<VPWidenPointerInductionRecipe>(&R) ||
7643+ (isa<VPWidenIntOrFpInductionRecipe>(&R) &&
7644+ !cast<VPWidenIntOrFpInductionRecipe>(&R)->getTruncInst ());
7645+ });
7646+ Cost += Legal->getInductionVars ().size () - NumWideIVs;
7647+
7648+ for (VPBlockBase *Block : to_vector (vp_depth_first_shallow (Header))) {
7649+ if (auto *Region = dyn_cast<VPRegionBlock>(Block)) {
7650+ assert (Region->isReplicator ());
7651+ VPBasicBlock *Then =
7652+ cast<VPBasicBlock>(Region->getEntry ()->getSuccessors ()[0 ]);
7653+ for (VPRecipeBase &R : *Then) {
7654+ if (isa<VPInstruction, VPScalarIVStepsRecipe>(&R))
7655+ continue ;
7656+ auto *RepR = cast<VPReplicateRecipe>(&R);
7657+ Cost += CM.getInstructionCost (RepR->getUnderlyingInstr (), VF).first ;
7658+ }
7659+ continue ;
7660+ }
7661+
7662+ VPCostContext Ctx (CM.TTI , OrigLoop->getHeader ()->getContext ());
7663+ for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) {
7664+ InstructionCost RecipeCost = R.computeCost (VF, Ctx);
7665+ if (!RecipeCost.isValid ()) {
7666+ if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
7667+ RecipeCost = CM.getInstructionCost (IG->getInsertPos (), VF).first ;
7668+ } else if (auto *WidenMem =
7669+ dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
7670+ RecipeCost =
7671+ CM.getInstructionCost (&WidenMem->getIngredient (), VF).first ;
7672+ } else if (auto *I = dyn_cast_or_null<Instruction>(
7673+ R.getVPSingleValue ()->getUnderlyingValue ()))
7674+ RecipeCost = CM.getInstructionCost (I, VF).first ;
7675+ else
7676+ continue ;
7677+ }
7678+ if (ForceTargetInstructionCost.getNumOccurrences () > 0 )
7679+ Cost = InstructionCost (ForceTargetInstructionCost);
7680+
7681+ LLVM_DEBUG ({
7682+ dbgs () << " Cost of " << RecipeCost << " for " << VF << " : " ;
7683+ R.dump ();
7684+ });
7685+ Cost += RecipeCost;
7686+ }
7687+ }
7688+ Cost += 1 ;
7689+ LLVM_DEBUG (dbgs () << " Cost for " << VF << " : " << Cost << " \n " );
7690+ return Cost;
7691+ }
7692+
7693+ std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan () {
7694+ // If there is a single VPlan with a single VF, return it directly.
7695+ if (VPlans.size () == 1 && size (VPlans[0 ]->vectorFactors ()) == 1 ) {
7696+ ElementCount VF = *VPlans[0 ]->vectorFactors ().begin ();
7697+ return {*VPlans[0 ], VF};
7698+ }
7699+
7700+ VPlan *BestPlan = &*VPlans[0 ];
7701+ assert (hasPlanWithVF (ElementCount::getFixed (1 )));
7702+ ElementCount BestVF = ElementCount::getFixed (1 );
7703+ InstructionCost ScalarCost = computeCost (
7704+ getBestPlanFor (ElementCount::getFixed (1 )), ElementCount::getFixed (1 ));
7705+ InstructionCost BestCost = ScalarCost;
7706+ bool ForceVectorization = Hints.getForce () == LoopVectorizeHints::FK_Enabled;
7707+ if (ForceVectorization) {
7708+ // Ignore scalar width, because the user explicitly wants vectorization.
7709+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7710+ // evaluation.
7711+ BestCost = InstructionCost::getMax ();
7712+ }
7713+
7714+ for (auto &P : VPlans) {
7715+ for (ElementCount VF : P->vectorFactors ()) {
7716+ if (VF.isScalar ())
7717+ continue ;
7718+ InstructionCost Cost = computeCost (*P, VF);
7719+ if (isMoreProfitable (VectorizationFactor (VF, Cost, ScalarCost),
7720+ VectorizationFactor (BestVF, BestCost, ScalarCost))) {
7721+ BestCost = Cost;
7722+ BestVF = VF;
7723+ BestPlan = &*P;
7724+ }
7725+ }
7726+ }
7727+ return {*BestPlan, BestVF};
7728+ }
7729+
76267730VPlan &LoopVectorizationPlanner::getBestPlanFor (ElementCount VF) const {
76277731 assert (count_if (VPlans,
76287732 [VF](const VPlanPtr &Plan) { return Plan->hasVF (VF); }) ==
@@ -10245,8 +10349,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1024510349 VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
1024610350 PSI, Checks);
1024710351
10248- VPlan &BestPlan = LVP.getBestPlanFor (VF.Width );
10249- LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10352+ const auto &[BestPlan, Width] = LVP.getBestPlan ();
10353+ LLVM_DEBUG (dbgs () << " VF picked by VPlan cost model: " << Width
10354+ << " \n " );
10355+ assert (VF.Width == Width &&
10356+ " VPlan cost model and legacy cost model disagreed" );
10357+ LVP.executePlan (Width, IC, BestPlan, LB, DT, false );
1025010358 ++LoopsVectorized;
1025110359
1025210360 // Add metadata to disable runtime unrolling a scalar loop when there
0 commit comments