[cherrypick][LV] Vectorize Epilogues for loops with small VF but high IC (#9666)

* [SCEV] Collect and merge loop guards through PHI nodes with multiple incoming values (llvm#113915)

This patch aims to strengthen the collection of loop guards by processing
PHI nodes with multiple incoming values as follows: collect guards for
all incoming values/blocks and try to merge them into a single guard for
the PHI node.
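
A minimal standalone sketch of the merge rule (assuming each incoming edge
contributed a (constant, min/max kind) pair; the names and the plain-integer
representation are invented for illustration and are not the LLVM
implementation):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>

// Analogous to the SCEV min/max kinds the patch handles (scUMinExpr, ...).
enum class Kind { UMin, UMax };

// Stands in for the (constant, kind) pattern found in one incoming edge's
// rewrite map, e.g. a guard rewriting %v to umin(7, %v) yields {7, UMin}.
struct Bound {
  uint64_t C;
  Kind K;
};

// Merge the bounds from two incoming edges. The merged bound must hold on
// *both* paths into the PHI, so only the weaker bound survives; mismatched
// or missing bounds yield nothing.
std::optional<Bound> mergeBounds(std::optional<Bound> A,
                                 std::optional<Bound> B) {
  if (!A || !B || A->K != B->K)
    return std::nullopt;
  switch (A->K) {
  case Kind::UMin: // value <= C on each edge: keep the larger constant.
    return Bound{std::max(A->C, B->C), A->K};
  case Kind::UMax: // value >= C on each edge: keep the smaller constant.
    return Bound{std::min(A->C, B->C), A->K};
  }
  return std::nullopt;
}

int main() {
  // One edge proves its incoming value <= 4, the other <= 7; the PHI itself
  // can only be bounded by the weaker of the two, i.e. umin(7, phi).
  if (auto M = mergeBounds(Bound{4, Kind::UMin}, Bound{7, Kind::UMin}))
    std::cout << "merged umin constant: " << M->C << '\n'; // prints 7
}
```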

The goal is to determine tighter bounds on the trip counts of scalar
tail loops after vectorization, helping to avoid unnecessary transforms.
In particular, we'd like to avoid vectorizing scalar tails of
hand-vectorized loops, for example in
[Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll),
discovered via llvm#108190
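
For illustration, the shape of loop this is aimed at looks roughly like the
hypothetical C++ below (not the actual pr38280.ll test): the hand-vectorized
main loop leaves a scalar tail of at most 3 iterations, and once loop guards
let SCEV prove that bound, the vectorizer can tell there is nothing to gain
from vectorizing the tail.

```cpp
#include <cstddef>
#include <cstdint>

// Hand-vectorized sum: the main loop consumes 4 elements per iteration and
// the scalar tail handles the remaining n % 4 elements (at most 3).
uint32_t sum(const uint32_t *a, std::size_t n) {
  uint32_t acc[4] = {0, 0, 0, 0};
  std::size_t i = 0;
  for (; i + 4 <= n; i += 4) { // "hand-vectorized" main loop
    acc[0] += a[i + 0];
    acc[1] += a[i + 1];
    acc[2] += a[i + 2];
    acc[3] += a[i + 3];
  }
  uint32_t s = acc[0] + acc[1] + acc[2] + acc[3];
  for (; i < n; ++i) // scalar tail: trip count provably < 4
    s += a[i];
  return s;
}
```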

Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u

PR: llvm#113915
(cherry picked from commit 7c8e05a)

* [SCEV] Address post-commit comments for llvm#113915.

(cherry picked from commit feb9b37)

* [LV] Vectorize Epilogues for loops with small VF but high IC (llvm#108190)

- Consider MainLoopVF * IC when determining whether Epilogue
Vectorization is profitable (see the sketch after this list)
- Allow the same VF for the Epilogue as for the main loop
- Use an upper bound for the trip count of the Epilogue when choosing
the Epilogue VF
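
The first bullet boils down to a check of roughly the following shape (a
rough sketch assuming the default EpilogueVectorizationMinVF of 16;
isEpilogueProfitable is a placeholder name, the real logic lives in
LoopVectorizationCostModel::isEpilogueVectorizationProfitable):

```cpp
#include <iostream>

// Mirrors LLVM's EpilogueVectorizationMinVF option (default 16), treated as
// a plain constant for this sketch.
constexpr unsigned EpilogueVectorizationMinVF = 16;

// With VF=4 and IC=4 the main vector loop consumes 16 elements per
// iteration, so up to 15 remainder iterations can be left over, enough to
// make a vectorized epilogue worthwhile even though VF alone is small.
bool isEpilogueProfitable(unsigned VF, unsigned Multiplier) {
  return VF * Multiplier >= EpilogueVectorizationMinVF;
}

int main() {
  unsigned VF = 4, IC = 4;
  std::cout << "old heuristic (Multiplier = 1):  "
            << isEpilogueProfitable(VF, 1) << '\n'; // 0: not profitable
  std::cout << "new heuristic (Multiplier = IC): "
            << isEpilogueProfitable(VF, IC) << '\n'; // 1: profitable
}
```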

PR: llvm#108190
---------

Co-authored-by: Florian Hahn <flo@fhahn.com>
(cherry picked from commit a8538b9)

---------

Co-authored-by: Florian Hahn <flo@fhahn.com>
juliannagele and fhahn authored Dec 2, 2024
1 parent 42f3e8e commit f1b6330
Showing 30 changed files with 3,960 additions and 1,529 deletions.
19 changes: 19 additions & 0 deletions llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1307,6 +1307,25 @@ class ScalarEvolution {

LoopGuards(ScalarEvolution &SE) : SE(SE) {}

/// Recursively collect loop guards in \p Guards, starting from
/// block \p Block with predecessor \p Pred. The intended starting point
/// is to collect from a loop header and its predecessor.
static void
collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const BasicBlock *Block, const BasicBlock *Pred,
SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
unsigned Depth = 0);

/// Collect loop guards in \p Guards, starting from PHINode \p
/// Phi, by calling \p collectFromBlock on the incoming blocks of
/// \p Phi and trying to merge the found constraints into a single
/// combined one for \p Phi.
static void collectFromPHI(
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
unsigned Depth);

public:
/// Collect rewrite map for loop guards for loop \p L, together with flags
/// indicating if NUW and NSW can be preserved during rewriting.
111 changes: 101 additions & 10 deletions llvm/lib/Analysis/ScalarEvolution.cpp
@@ -222,6 +222,10 @@ static cl::opt<unsigned> RangeIterThreshold(
cl::desc("Threshold for switching to iteratively computing SCEV ranges"),
cl::init(32));

static cl::opt<unsigned> MaxLoopGuardCollectionDepth(
"scalar-evolution-max-loop-guard-collection-depth", cl::Hidden,
cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1));

static cl::opt<bool>
ClassifyExpressions("scalar-evolution-classify-expressions",
cl::Hidden, cl::init(true),
@@ -10608,7 +10612,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB)
if (const Loop *L = LI.getLoopFor(BB))
return {L->getLoopPredecessor(), L->getHeader()};

return {nullptr, nullptr};
return {nullptr, BB};
}

/// SCEV structural equivalence is usually sufficient for testing whether two
@@ -15089,7 +15093,81 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,

ScalarEvolution::LoopGuards
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
BasicBlock *Header = L->getHeader();
BasicBlock *Pred = L->getLoopPredecessor();
LoopGuards Guards(SE);
SmallPtrSet<const BasicBlock *, 8> VisitedBlocks;
collectFromBlock(SE, Guards, Header, Pred, VisitedBlocks);
return Guards;
}

void ScalarEvolution::LoopGuards::collectFromPHI(
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
unsigned Depth) {
if (!SE.isSCEVable(Phi.getType()))
return;

using MinMaxPattern = std::pair<const SCEVConstant *, SCEVTypes>;
auto GetMinMaxConst = [&](unsigned IncomingIdx) -> MinMaxPattern {
const BasicBlock *InBlock = Phi.getIncomingBlock(IncomingIdx);
if (!VisitedBlocks.insert(InBlock).second)
return {nullptr, scCouldNotCompute};
auto [G, Inserted] = IncomingGuards.try_emplace(InBlock, LoopGuards(SE));
if (Inserted)
collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks,
Depth + 1);
auto &RewriteMap = G->second.RewriteMap;
if (RewriteMap.empty())
return {nullptr, scCouldNotCompute};
auto S = RewriteMap.find(SE.getSCEV(Phi.getIncomingValue(IncomingIdx)));
if (S == RewriteMap.end())
return {nullptr, scCouldNotCompute};
auto *SM = dyn_cast_if_present<SCEVMinMaxExpr>(S->second);
if (!SM)
return {nullptr, scCouldNotCompute};
if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(SM->getOperand(0)))
return {C0, SM->getSCEVType()};
return {nullptr, scCouldNotCompute};
};
auto MergeMinMaxConst = [](MinMaxPattern P1,
MinMaxPattern P2) -> MinMaxPattern {
auto [C1, T1] = P1;
auto [C2, T2] = P2;
if (!C1 || !C2 || T1 != T2)
return {nullptr, scCouldNotCompute};
switch (T1) {
case scUMaxExpr:
return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1};
case scSMaxExpr:
return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1};
case scUMinExpr:
return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1};
case scSMinExpr:
return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1};
default:
llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs.");
}
};
auto P = GetMinMaxConst(0);
for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) {
if (!P.first)
break;
P = MergeMinMaxConst(P, GetMinMaxConst(In));
}
if (P.first) {
const SCEV *LHS = SE.getSCEV(const_cast<PHINode *>(&Phi));
SmallVector<const SCEV *, 2> Ops({P.first, LHS});
const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops);
Guards.RewriteMap.insert({LHS, RHS});
}
}

void ScalarEvolution::LoopGuards::collectFromBlock(
ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
const BasicBlock *Block, const BasicBlock *Pred,
SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks, unsigned Depth) {
SmallVector<const SCEV *> ExprsToRewrite;
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
const SCEV *RHS,
@@ -15428,14 +15506,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
}
};

BasicBlock *Header = L->getHeader();
SmallVector<PointerIntPair<Value *, 1, bool>> Terms;
// First, collect information from assumptions dominating the loop.
for (auto &AssumeVH : SE.AC.assumptions()) {
if (!AssumeVH)
continue;
auto *AssumeI = cast<CallInst>(AssumeVH);
if (!SE.DT.dominates(AssumeI, Header))
if (!SE.DT.dominates(AssumeI, Block))
continue;
Terms.emplace_back(AssumeI->getOperand(0), true);
}
@@ -15446,27 +15523,42 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
if (GuardDecl)
for (const auto *GU : GuardDecl->users())
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
if (Guard->getFunction() == Header->getParent() &&
SE.DT.dominates(Guard, Header))
if (Guard->getFunction() == Block->getParent() &&
SE.DT.dominates(Guard, Block))
Terms.emplace_back(Guard->getArgOperand(0), true);

// Third, collect conditions from dominating branches. Starting at the loop
// predecessor, climb up the predecessor chain, as long as there are
// predecessors that can be found that have unique successors leading to the
// original header.
// TODO: share this logic with isLoopEntryGuardedByCond.
for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
L->getLoopPredecessor(), Header);
Pair.first;
std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
for (; Pair.first;
Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {

VisitedBlocks.insert(Pair.second);
const BranchInst *LoopEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional())
continue;

Terms.emplace_back(LoopEntryPredicate->getCondition(),
LoopEntryPredicate->getSuccessor(0) == Pair.second);

// If we are recursively collecting guards stop after 2
// predecessors to limit compile-time impact for now.
if (Depth > 0 && Terms.size() == 2)
break;
}
// Finally, if we stopped climbing the predecessor chain because
// there wasn't a unique one to continue, try to collect conditions
// for PHINodes by recursively following all of their incoming
// blocks and try to merge the found conditions to build a new one
// for the Phi.
if (Pair.second->hasNPredecessorsOrMore(2) &&
Depth < MaxLoopGuardCollectionDepth) {
SmallDenseMap<const BasicBlock *, LoopGuards> IncomingGuards;
for (auto &Phi : Pair.second->phis())
collectFromPHI(SE, Guards, Phi, VisitedBlocks, IncomingGuards, Depth);
}

// Now apply the information from the collected conditions to
@@ -15523,7 +15615,6 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
Guards.RewriteMap.insert({Expr, Guards.rewrite(RewriteTo)});
}
}
return Guards;
}

const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -457,6 +457,12 @@ class LoopVectorizationPlanner {
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B) const;

/// Returns true if the per-lane cost of VectorizationFactor A is lower than
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
bool isMoreProfitable(const VectorizationFactor &A,
const VectorizationFactor &B,
const unsigned MaxTripCount) const;

/// Determines if we have the infrastructure to vectorize the loop and its
/// epilogue, assuming the main loop is vectorized by \p VF.
bool isCandidateForEpilogueVectorization(const ElementCount VF) const;
49 changes: 36 additions & 13 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1554,7 +1554,10 @@ class LoopVectorizationCostModel {
/// Returns true if epilogue vectorization is considered profitable, and
/// false otherwise.
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
/// \p Multiplier is an additional scaling factor applied to VF before
/// comparing to EpilogueVectorizationMinVF.
bool isEpilogueVectorizationProfitable(const ElementCount VF,
const unsigned Multiplier) const;

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
@@ -4293,12 +4296,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
}

bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
const VectorizationFactor &A, const VectorizationFactor &B,
const unsigned MaxTripCount) const {
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;

unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);

// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4347,6 +4349,13 @@ bool LoopVectorizationPlanner::isMoreProfitable(
return CmpFn(RTCostA, RTCostB);
}

bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B) const {
const unsigned MaxTripCount =
PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
}

static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
OptimizationRemarkEmitter *ORE,
Loop *TheLoop) {
@@ -4626,7 +4635,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
const ElementCount VF) const {
const ElementCount VF, const unsigned Multiplier) const {
// FIXME: We need a much better cost-model to take different parameters such
// as register pressure, code size increase and cost of extra branches into
// account. For now we apply a very crude heuristic and only consider loops
@@ -4641,9 +4650,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;

unsigned Multiplier = 1;
if (VF.isScalable())
Multiplier = getVScaleForTuning(TheLoop, TTI).value_or(1);
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
return true;
return false;
@@ -4690,7 +4696,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) {
unsigned Multiplier = IC;
if (MainLoopVF.isScalable())
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);

if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
@@ -4709,16 +4719,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
for (auto &NextVF : ProfitableVFs) {
// Skip candidate VFs without a corresponding VPlan.
if (!hasPlanWithVF(NextVF.Width))
continue;

// Skip candidate VFs with widths >= the estimate runtime VF (scalable
// vectors) or the VF of the main loop (fixed vectors).
// Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
// vectors) or > the VF of the main loop (fixed vectors).
if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
ElementCount::isKnownGE(NextVF.Width, MainLoopVF))
(NextVF.Width.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
(!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
continue;

// If NextVF is greater than the number of remaining iterations, the
@@ -4729,6 +4743,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
const SCEV *TC = createTripCountSCEV(TCType, PSE, OrigLoop);
RemainingIterations = SE.getURemExpr(
TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
MaxTripCount =
SE.getUnsignedRangeMax(RemainingIterations).getZExtValue();
}
LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
<< MaxTripCount << "\n");
}
if (SE.isKnownPredicate(
CmpInst::ICMP_UGT,
@@ -4737,7 +4759,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
continue;
}

if (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))
if (Result.Width.isScalar() ||
isMoreProfitable(NextVF, Result, MaxTripCount))
Result = NextVF;
}
