-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SCEV] Collect and merge loop guards through PHI nodes with multiple incoming values #113915
Conversation
@llvm/pr-subscribers-llvm-analysis Author: Julian Nagele (juliannagele) ChangesThis patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node. Full diff: https://github.com/llvm/llvm-project/pull/113915.diff 4 Files Affected:
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 179a2c38d9d3c2..cdc46cf24a0546 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1316,6 +1316,11 @@ class ScalarEvolution {
LoopGuards(ScalarEvolution &SE) : SE(SE) {}
+ static LoopGuards
+ collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+ const BasicBlock *Block, const BasicBlock *Pred,
+ SmallPtrSet<const BasicBlock *, 8> VisitedBlocks);
+
public:
/// Collect rewrite map for loop guards for loop \p L, together with flags
/// indicating if NUW and NSW can be preserved during rewriting.
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c939270ed39a65..d9cab0471ef0f3 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10648,7 +10648,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB)
if (const Loop *L = LI.getLoopFor(BB))
return {L->getLoopPredecessor(), L->getHeader()};
- return {nullptr, nullptr};
+ return {nullptr, BB};
}
/// SCEV structural equivalence is usually sufficient for testing whether two
@@ -15217,7 +15217,16 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
ScalarEvolution::LoopGuards
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Pred = L->getLoopPredecessor();
LoopGuards Guards(SE);
+ return collectFromBlock(SE, Guards, Header, Pred, {});
+}
+
+ScalarEvolution::LoopGuards ScalarEvolution::LoopGuards::collectFromBlock(
+ ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+ const BasicBlock *Block, const BasicBlock *Pred,
+ SmallPtrSet<const BasicBlock *, 8> VisitedBlocks) {
SmallVector<const SCEV *> ExprsToRewrite;
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
const SCEV *RHS,
@@ -15556,14 +15565,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
}
};
- BasicBlock *Header = L->getHeader();
SmallVector<PointerIntPair<Value *, 1, bool>> Terms;
// First, collect information from assumptions dominating the loop.
for (auto &AssumeVH : SE.AC.assumptions()) {
if (!AssumeVH)
continue;
auto *AssumeI = cast<CallInst>(AssumeVH);
- if (!SE.DT.dominates(AssumeI, Header))
+ if (!SE.DT.dominates(AssumeI, Block))
continue;
Terms.emplace_back(AssumeI->getOperand(0), true);
}
@@ -15574,8 +15582,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
if (GuardDecl)
for (const auto *GU : GuardDecl->users())
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
- if (Guard->getFunction() == Header->getParent() &&
- SE.DT.dominates(Guard, Header))
+ if (Guard->getFunction() == Block->getParent() &&
+ SE.DT.dominates(Guard, Block))
Terms.emplace_back(Guard->getArgOperand(0), true);
// Third, collect conditions from dominating branches. Starting at the loop
@@ -15583,11 +15591,10 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
// predecessors that can be found that have unique successors leading to the
// original header.
// TODO: share this logic with isLoopEntryGuardedByCond.
- for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
- L->getLoopPredecessor(), Header);
- Pair.first;
+ std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
+ for (; Pair.first;
Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
-
+ VisitedBlocks.insert(Pair.second);
const BranchInst *LoopEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional())
@@ -15596,6 +15603,66 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
Terms.emplace_back(LoopEntryPredicate->getCondition(),
LoopEntryPredicate->getSuccessor(0) == Pair.second);
}
+ // Finally, if we stopped climbing the predecessor chain because
+ // there wasn't a unique one to continue, try to collect conditions
+ // for PHINodes by recursively following all of their incoming
+ // blocks and try to merge the found conditions to build a new one
+ // for the Phi.
+ if (Pair.second->hasNPredecessorsOrMore(2)) {
+ for (auto &Phi : Pair.second->phis()) {
+ if (!SE.isSCEVable(Phi.getType()))
+ continue;
+
+ using MinMaxPattern = std::pair<const SCEVConstant *, SCEVTypes>;
+ auto GetMinMaxConst = [&SE, &VisitedBlocks, &Pair,
+ &Phi](unsigned int In) -> MinMaxPattern {
+ LoopGuards G(SE);
+ if (VisitedBlocks.insert(Phi.getIncomingBlock(In)).second)
+ collectFromBlock(SE, G, Pair.second, Phi.getIncomingBlock(In),
+ VisitedBlocks);
+ const SCEV *S = G.RewriteMap[SE.getSCEV(Phi.getIncomingValue(In))];
+ auto *SM = dyn_cast_if_present<SCEVMinMaxExpr>(S);
+ if (!SM)
+ return {nullptr, scCouldNotCompute};
+ if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(SM->getOperand(0)))
+ return {C0, SM->getSCEVType()};
+ if (const SCEVConstant *C1 = dyn_cast<SCEVConstant>(SM->getOperand(1)))
+ return {C1, SM->getSCEVType()};
+ return {nullptr, scCouldNotCompute};
+ };
+ auto MergeMinMaxConst = [](MinMaxPattern P1,
+ MinMaxPattern P2) -> MinMaxPattern {
+ auto [C1, T1] = P1;
+ auto [C2, T2] = P2;
+ if (!C1 || !C2 || T1 != T2)
+ return {nullptr, scCouldNotCompute};
+ switch (T1) {
+ case scUMaxExpr:
+ return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1};
+ case scSMaxExpr:
+ return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1};
+ case scUMinExpr:
+ return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1};
+ case scSMinExpr:
+ return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1};
+ default:
+ llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs.");
+ }
+ };
+ auto P = GetMinMaxConst(0);
+ for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) {
+ if (!P.first)
+ break;
+ P = MergeMinMaxConst(P, GetMinMaxConst(In));
+ }
+ if (P.first) {
+ const SCEV *LHS = SE.getSCEV(const_cast<PHINode *>(&Phi));
+ SmallVector<const SCEV *, 2> Ops({P.first, LHS});
+ const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops);
+ Guards.RewriteMap.insert({LHS, RHS});
+ }
+ }
+ }
// Now apply the information from the collected conditions to
// Guards.RewriteMap. Conditions are processed in reverse order, so the
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count.ll b/llvm/test/Analysis/ScalarEvolution/trip-count.ll
index 8fc5b9b4096127..7304409814b0e1 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count.ll
@@ -211,3 +211,85 @@ for.body:
exit:
ret void
}
+
+define void @epilogue(i64 %count) {
+; CHECK-LABEL: 'epilogue'
+; CHECK-NEXT: Determining loop execution counts for: @epilogue
+; CHECK-NEXT: Loop %epilogue: backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: constant max backedge-taken count is i64 6
+; CHECK-NEXT: Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: Trip multiple is 1
+; CHECK-NEXT: Loop %while.body: backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: constant max backedge-taken count is i64 2305843009213693951
+; CHECK-NEXT: Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: Trip multiple is 1
+entry:
+ %cmp = icmp ugt i64 %count, 7
+ br i1 %cmp, label %while.body, label %epilogue.preheader
+
+while.body:
+ %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ]
+ %sub = add i64 %iv, -8
+ %exitcond.not = icmp ugt i64 %sub, 7
+ br i1 %exitcond.not, label %while.body, label %while.loopexit
+
+while.loopexit:
+ %sub.exit = phi i64 [ %sub, %while.body ]
+ br label %epilogue.preheader
+
+epilogue.preheader:
+ %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ]
+ %epilogue.cmp = icmp eq i64 %count.epilogue, 0
+ br i1 %epilogue.cmp, label %exit, label %epilogue
+
+epilogue:
+ %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ]
+ %dec = add i64 %iv.epilogue, -1
+ %exitcond.epilogue = icmp eq i64 %dec, 0
+ br i1 %exitcond.epilogue, label %exit, label %epilogue
+
+exit:
+ ret void
+
+}
+
+define void @epilogue2(i64 %count) {
+; CHECK-LABEL: 'epilogue2'
+; CHECK-NEXT: Determining loop execution counts for: @epilogue2
+; CHECK-NEXT: Loop %epilogue: backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: constant max backedge-taken count is i64 8
+; CHECK-NEXT: Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: Trip multiple is 1
+; CHECK-NEXT: Loop %while.body: backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: constant max backedge-taken count is i64 2305843009213693951
+; CHECK-NEXT: Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: Trip multiple is 1
+entry:
+ %cmp = icmp ugt i64 %count, 9
+ br i1 %cmp, label %while.body, label %epilogue.preheader
+
+while.body:
+ %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ]
+ %sub = add i64 %iv, -8
+ %exitcond.not = icmp ugt i64 %sub, 7
+ br i1 %exitcond.not, label %while.body, label %while.loopexit
+
+while.loopexit:
+ %sub.exit = phi i64 [ %sub, %while.body ]
+ br label %epilogue.preheader
+
+epilogue.preheader:
+ %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ]
+ %epilogue.cmp = icmp eq i64 %count.epilogue, 0
+ br i1 %epilogue.cmp, label %exit, label %epilogue
+
+epilogue:
+ %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ]
+ %dec = add i64 %iv.epilogue, -1
+ %exitcond.epilogue = icmp eq i64 %dec, 0
+ br i1 %exitcond.epilogue, label %exit, label %epilogue
+
+exit:
+ ret void
+
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
index 70b002f766b753..966d7e3cded0ab 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
@@ -41,7 +41,7 @@ define void @apply_delta(ptr nocapture noundef %dst, ptr nocapture noundef reado
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1
; CHECK-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]]
; CHECK: while.end9:
; CHECK-NEXT: ret void
;
|
@llvm/pr-subscribers-llvm-transforms Author: Julian Nagele (juliannagele) ChangesThis patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node. Full diff: https://github.com/llvm/llvm-project/pull/113915.diff 4 Files Affected:
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 179a2c38d9d3c2..cdc46cf24a0546 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1316,6 +1316,11 @@ class ScalarEvolution {
LoopGuards(ScalarEvolution &SE) : SE(SE) {}
+ static LoopGuards
+ collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+ const BasicBlock *Block, const BasicBlock *Pred,
+ SmallPtrSet<const BasicBlock *, 8> VisitedBlocks);
+
public:
/// Collect rewrite map for loop guards for loop \p L, together with flags
/// indicating if NUW and NSW can be preserved during rewriting.
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c939270ed39a65..d9cab0471ef0f3 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10648,7 +10648,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB)
if (const Loop *L = LI.getLoopFor(BB))
return {L->getLoopPredecessor(), L->getHeader()};
- return {nullptr, nullptr};
+ return {nullptr, BB};
}
/// SCEV structural equivalence is usually sufficient for testing whether two
@@ -15217,7 +15217,16 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
ScalarEvolution::LoopGuards
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Pred = L->getLoopPredecessor();
LoopGuards Guards(SE);
+ return collectFromBlock(SE, Guards, Header, Pred, {});
+}
+
+ScalarEvolution::LoopGuards ScalarEvolution::LoopGuards::collectFromBlock(
+ ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+ const BasicBlock *Block, const BasicBlock *Pred,
+ SmallPtrSet<const BasicBlock *, 8> VisitedBlocks) {
SmallVector<const SCEV *> ExprsToRewrite;
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
const SCEV *RHS,
@@ -15556,14 +15565,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
}
};
- BasicBlock *Header = L->getHeader();
SmallVector<PointerIntPair<Value *, 1, bool>> Terms;
// First, collect information from assumptions dominating the loop.
for (auto &AssumeVH : SE.AC.assumptions()) {
if (!AssumeVH)
continue;
auto *AssumeI = cast<CallInst>(AssumeVH);
- if (!SE.DT.dominates(AssumeI, Header))
+ if (!SE.DT.dominates(AssumeI, Block))
continue;
Terms.emplace_back(AssumeI->getOperand(0), true);
}
@@ -15574,8 +15582,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
if (GuardDecl)
for (const auto *GU : GuardDecl->users())
if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
- if (Guard->getFunction() == Header->getParent() &&
- SE.DT.dominates(Guard, Header))
+ if (Guard->getFunction() == Block->getParent() &&
+ SE.DT.dominates(Guard, Block))
Terms.emplace_back(Guard->getArgOperand(0), true);
// Third, collect conditions from dominating branches. Starting at the loop
@@ -15583,11 +15591,10 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
// predecessors that can be found that have unique successors leading to the
// original header.
// TODO: share this logic with isLoopEntryGuardedByCond.
- for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
- L->getLoopPredecessor(), Header);
- Pair.first;
+ std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
+ for (; Pair.first;
Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
-
+ VisitedBlocks.insert(Pair.second);
const BranchInst *LoopEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional())
@@ -15596,6 +15603,66 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
Terms.emplace_back(LoopEntryPredicate->getCondition(),
LoopEntryPredicate->getSuccessor(0) == Pair.second);
}
+ // Finally, if we stopped climbing the predecessor chain because
+ // there wasn't a unique one to continue, try to collect conditions
+ // for PHINodes by recursively following all of their incoming
+ // blocks and try to merge the found conditions to build a new one
+ // for the Phi.
+ if (Pair.second->hasNPredecessorsOrMore(2)) {
+ for (auto &Phi : Pair.second->phis()) {
+ if (!SE.isSCEVable(Phi.getType()))
+ continue;
+
+ using MinMaxPattern = std::pair<const SCEVConstant *, SCEVTypes>;
+ auto GetMinMaxConst = [&SE, &VisitedBlocks, &Pair,
+ &Phi](unsigned int In) -> MinMaxPattern {
+ LoopGuards G(SE);
+ if (VisitedBlocks.insert(Phi.getIncomingBlock(In)).second)
+ collectFromBlock(SE, G, Pair.second, Phi.getIncomingBlock(In),
+ VisitedBlocks);
+ const SCEV *S = G.RewriteMap[SE.getSCEV(Phi.getIncomingValue(In))];
+ auto *SM = dyn_cast_if_present<SCEVMinMaxExpr>(S);
+ if (!SM)
+ return {nullptr, scCouldNotCompute};
+ if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(SM->getOperand(0)))
+ return {C0, SM->getSCEVType()};
+ if (const SCEVConstant *C1 = dyn_cast<SCEVConstant>(SM->getOperand(1)))
+ return {C1, SM->getSCEVType()};
+ return {nullptr, scCouldNotCompute};
+ };
+ auto MergeMinMaxConst = [](MinMaxPattern P1,
+ MinMaxPattern P2) -> MinMaxPattern {
+ auto [C1, T1] = P1;
+ auto [C2, T2] = P2;
+ if (!C1 || !C2 || T1 != T2)
+ return {nullptr, scCouldNotCompute};
+ switch (T1) {
+ case scUMaxExpr:
+ return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1};
+ case scSMaxExpr:
+ return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1};
+ case scUMinExpr:
+ return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1};
+ case scSMinExpr:
+ return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1};
+ default:
+ llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs.");
+ }
+ };
+ auto P = GetMinMaxConst(0);
+ for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) {
+ if (!P.first)
+ break;
+ P = MergeMinMaxConst(P, GetMinMaxConst(In));
+ }
+ if (P.first) {
+ const SCEV *LHS = SE.getSCEV(const_cast<PHINode *>(&Phi));
+ SmallVector<const SCEV *, 2> Ops({P.first, LHS});
+ const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops);
+ Guards.RewriteMap.insert({LHS, RHS});
+ }
+ }
+ }
// Now apply the information from the collected conditions to
// Guards.RewriteMap. Conditions are processed in reverse order, so the
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count.ll b/llvm/test/Analysis/ScalarEvolution/trip-count.ll
index 8fc5b9b4096127..7304409814b0e1 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-count.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count.ll
@@ -211,3 +211,85 @@ for.body:
exit:
ret void
}
+
+define void @epilogue(i64 %count) {
+; CHECK-LABEL: 'epilogue'
+; CHECK-NEXT: Determining loop execution counts for: @epilogue
+; CHECK-NEXT: Loop %epilogue: backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: constant max backedge-taken count is i64 6
+; CHECK-NEXT: Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: Trip multiple is 1
+; CHECK-NEXT: Loop %while.body: backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: constant max backedge-taken count is i64 2305843009213693951
+; CHECK-NEXT: Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: Trip multiple is 1
+entry:
+ %cmp = icmp ugt i64 %count, 7
+ br i1 %cmp, label %while.body, label %epilogue.preheader
+
+while.body:
+ %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ]
+ %sub = add i64 %iv, -8
+ %exitcond.not = icmp ugt i64 %sub, 7
+ br i1 %exitcond.not, label %while.body, label %while.loopexit
+
+while.loopexit:
+ %sub.exit = phi i64 [ %sub, %while.body ]
+ br label %epilogue.preheader
+
+epilogue.preheader:
+ %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ]
+ %epilogue.cmp = icmp eq i64 %count.epilogue, 0
+ br i1 %epilogue.cmp, label %exit, label %epilogue
+
+epilogue:
+ %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ]
+ %dec = add i64 %iv.epilogue, -1
+ %exitcond.epilogue = icmp eq i64 %dec, 0
+ br i1 %exitcond.epilogue, label %exit, label %epilogue
+
+exit:
+ ret void
+
+}
+
+define void @epilogue2(i64 %count) {
+; CHECK-LABEL: 'epilogue2'
+; CHECK-NEXT: Determining loop execution counts for: @epilogue2
+; CHECK-NEXT: Loop %epilogue: backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: constant max backedge-taken count is i64 8
+; CHECK-NEXT: Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT: Loop %epilogue: Trip multiple is 1
+; CHECK-NEXT: Loop %while.body: backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: constant max backedge-taken count is i64 2305843009213693951
+; CHECK-NEXT: Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT: Loop %while.body: Trip multiple is 1
+entry:
+ %cmp = icmp ugt i64 %count, 9
+ br i1 %cmp, label %while.body, label %epilogue.preheader
+
+while.body:
+ %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ]
+ %sub = add i64 %iv, -8
+ %exitcond.not = icmp ugt i64 %sub, 7
+ br i1 %exitcond.not, label %while.body, label %while.loopexit
+
+while.loopexit:
+ %sub.exit = phi i64 [ %sub, %while.body ]
+ br label %epilogue.preheader
+
+epilogue.preheader:
+ %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ]
+ %epilogue.cmp = icmp eq i64 %count.epilogue, 0
+ br i1 %epilogue.cmp, label %exit, label %epilogue
+
+epilogue:
+ %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ]
+ %dec = add i64 %iv.epilogue, -1
+ %exitcond.epilogue = icmp eq i64 %dec, 0
+ br i1 %exitcond.epilogue, label %exit, label %epilogue
+
+exit:
+ ret void
+
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
index 70b002f766b753..966d7e3cded0ab 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
@@ -41,7 +41,7 @@ define void @apply_delta(ptr nocapture noundef %dst, ptr nocapture noundef reado
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1
; CHECK-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]]
; CHECK: while.end9:
; CHECK-NEXT: ret void
;
|
…ltiple incoming Values
&Phi](unsigned int In) -> MinMaxPattern { | ||
LoopGuards G(SE); | ||
if (VisitedBlocks.insert(Phi.getIncomingBlock(In)).second) | ||
collectFromBlock(SE, G, Pair.second, Phi.getIncomingBlock(In), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should probably only allow a single level of recursion to start with, i.e. don't allow multiple predecessors after recursing here the first time
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done, added MaxLoopGuardCollectionDepth
and defaulted it to 1
It would be good to adjust the description to mention that this helps to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms on them (would also be good to link the motivating epilogue vectorization change) |
Haven't looked at the implementation yet, but this change has some compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a256e89fd1b9f42d3f29df42d4f21ea823476ff3&to=24ca499a1d02f88bee077c2a3135dfae8f1094fe&stat=instructions:u |
…with multiple incoming Values
✅ With the latest revision this PR passed the C/C++ code formatter. |
Thanks for checking! I've limited the recursion depth to 1 as Florian suggested; that should hopefully help with compile times. |
This already cut down compile-time quite a bit for most configs but there are still some cases where it increases in the 0.06-0.11% range: https://llvm-compile-time-tracker.com/compare.php?from=724b432410fd59c63cc313d41824eda5ec84052f&to=a1c2970d6d6715bdf0c903ee7507c24a58310011&stat=instructions:u There's also some reductions for some workloads, plus a number of code changes. @juliannagele do you know if they are all due to vectorizers or something else? In order to bring down compile-time even more it might be good to further limit the # of predecessors we check in the recursive case. One surprising thing is that there's a notable change in compile-time in |
This is very weird. No idea what is happening there without profiling it. I did another run with this patch to see whether this might just be very unlucky noise, but that doesn't seem to be the case: https://llvm-compile-time-tracker.com/compare.php?from=dd116369f646d023a2e7e5c145a1bed5dcf9a45c&to=83ad7da76ee34ada80255cc4e39f99d2afd9e8d3&stat=instructions:u |
The latest commit adds this suggestion, which should (hopefully) help with the remaining compile time issues. |
if (Pair.second->hasNPredecessorsOrMore(2) && | ||
Depth < MaxLoopGuardCollectionDepth) { | ||
for (auto &Phi : Pair.second->phis()) { | ||
collectFromPHI(SE, Guards, Phi, VisitedBlocks, Depth); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If there are multiple phis , we collect the loop guards for all predecessors for each phi at the moment, I think? Can we instead just collect them once per predecessor?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, thanks, the latest version should only collect guards once per predecessor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the latest updates, compile-time looks much better: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u
@Nikci WDYT?
(As for O0, I tried building CTMark with assert(false)
in collectLoopGuards, so if the changes are real it might be down to something like getting unlucky with icache with the modified clang binary)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks given the latest compile-time numbers.
Would be good to wait to see if @nikic has any further comments (or concerns re compile-time)
if (!IncomingGuards.contains(InBlock)) { | ||
LoopGuards G(SE); | ||
collectFromBlock(SE, G, Phi.getParent(), InBlock, VisitedBlocks, | ||
Depth + 1); | ||
IncomingGuards.try_emplace(InBlock, std::move(G)); | ||
} | ||
const LoopGuards &G = IncomingGuards.at(InBlock); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Could try to avoid the repeated lookup by using IncomingGuards.insert()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
…incoming values (llvm#113915) This patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node. The goal is to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms. In particular we'd like to avoid vectorizing scalar tails of hand-vectorized loops, for example in [Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll), discovered via llvm#108190 Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u PR: llvm#113915
/// Collect loop guards in \p Guards, starting from PHINode \p | ||
/// Phi, by calling \p collectFromBlock on the incoming blocks of | ||
/// \Phi and trying to merge the found constraints into a single | ||
/// combined on for \p Phi. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
on -> one
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be fixed in feb9b37 (@juliannagele is OOO for few days, so I took care of it)
if (Inserted) | ||
collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks, | ||
Depth + 1); | ||
auto S = G->second.RewriteMap.find( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can early exit if RewriteMap is empty. This will save constructing the SCEV expression.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be fixed in feb9b37 (@juliannagele is OOO for few days, so I took care of it)
return {nullptr, scCouldNotCompute}; | ||
if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(SM->getOperand(0))) | ||
return {C0, SM->getSCEVType()}; | ||
if (const SCEVConstant *C1 = dyn_cast<SCEVConstant>(SM->getOperand(1))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If SM has a SCEVConstant operand, it will always be operand 0, so the second check is redundant.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be fixed in feb9b37 (@juliannagele is OOO for few days, so I took care of it)
Address post-commit comments for #113915.
…incoming values (llvm#113915) This patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node. The goal is to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms. In particular we'd like to avoid vectorizing scalar tails of hand-vectorized loops, for example in [Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll), discovered via llvm#108190 Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u PR: llvm#113915
… IC (#9666) * [SCEV] Collect and merge loop guards through PHI nodes with multiple incoming values (llvm#113915) This patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node. The goal is to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms. In particular we'd like to avoid vectorizing scalar tails of hand-vectorized loops, for example in [Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll), discovered via llvm#108190 Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u PR: llvm#113915 (cherry picked from commit 7c8e05a) * [SCEV] Address post-commit comments for llvm#113915. Address post-commit comments for llvm#113915. (cherry picked from commit feb9b37) * [LV] Vectorize Epilogues for loops with small VF but high IC (llvm#108190) - Consider MainLoopVF * IC when determining whether Epilogue Vectorization is profitable - Allow the same VF for the Epilogue as for the main loop - Use an upper bound for the trip count of the Epilogue when choosing the Epilogue VF PR: llvm#108190 --------- Co-authored-by: Florian Hahn <flo@fhahn.com> (cherry picked from commit a8538b9) --------- Co-authored-by: Florian Hahn <flo@fhahn.com>
This patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node.
The goal is to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms. In particular we'd like to avoid vectorizing scalar tails of hand-vectorized loops, for example in Transforms/PhaseOrdering/X86/pr38280.ll, discovered via #108190