From e403d25200ad75d1e65ee1c5ba2bfe64a7b6b5ab Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Fri, 15 Nov 2024 10:03:08 +0000 Subject: [PATCH] [SCEV] Collect and merge loop guards through PHI nodes with multiple incoming values (#113915) This patch aims to strengthen collection of loop guards by processing PHI nodes with multiple incoming values as follows: collect guards for all incoming values/blocks and try to merge them into a single one for the PHI node. The goal is to determine tighter bounds on the trip counts of scalar tail loops after vectorization, helping to avoid unnecessary transforms. In particular we'd like to avoid vectorizing scalar tails of hand-vectorized loops, for example in [Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll), discovered via https://github.com/llvm/llvm-project/pull/108190 Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u PR: https://github.com/llvm/llvm-project/pull/113915 --- llvm/include/llvm/Analysis/ScalarEvolution.h | 19 ++ llvm/lib/Analysis/ScalarEvolution.cpp | 111 ++++++- ...t-guard-info-with-multiple-predecessors.ll | 279 ++++++++++++++++++ .../Transforms/PhaseOrdering/X86/pr38280.ll | 2 +- 4 files changed, 400 insertions(+), 11 deletions(-) create mode 100644 llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 179a2c38d9d3c2..c254443e6ed018 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1316,6 +1316,25 @@ class ScalarEvolution { LoopGuards(ScalarEvolution &SE) : SE(SE) {} + /// Recursively collect loop guards in \p Guards, starting from + /// block \p Block with predecessor \p Pred. The intended starting point + /// is to collect from a loop header and its predecessor. + static void + collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const BasicBlock *Block, const BasicBlock *Pred, + SmallPtrSetImpl &VisitedBlocks, + unsigned Depth = 0); + + /// Collect loop guards in \p Guards, starting from PHINode \p + /// Phi, by calling \p collectFromBlock on the incoming blocks of + /// \Phi and trying to merge the found constraints into a single + /// combined on for \p Phi. + static void collectFromPHI( + ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const PHINode &Phi, SmallPtrSetImpl &VisitedBlocks, + SmallDenseMap &IncomingGuards, + unsigned Depth); + public: /// Collect rewrite map for loop guards for loop \p L, together with flags /// indicating if NUW and NSW can be preserved during rewriting. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index c939270ed39a65..8ab40dbbab7751 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -222,6 +222,10 @@ static cl::opt RangeIterThreshold( cl::desc("Threshold for switching to iteratively computing SCEV ranges"), cl::init(32)); +static cl::opt MaxLoopGuardCollectionDepth( + "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden, + cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1)); + static cl::opt ClassifyExpressions("scalar-evolution-classify-expressions", cl::Hidden, cl::init(true), @@ -10648,7 +10652,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) if (const Loop *L = LI.getLoopFor(BB)) return {L->getLoopPredecessor(), L->getHeader()}; - return {nullptr, nullptr}; + return {nullptr, BB}; } /// SCEV structural equivalence is usually sufficient for testing whether two @@ -15217,7 +15221,81 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS, ScalarEvolution::LoopGuards ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Pred = L->getLoopPredecessor(); LoopGuards Guards(SE); + SmallPtrSet VisitedBlocks; + collectFromBlock(SE, Guards, Header, Pred, VisitedBlocks); + return Guards; +} + +void ScalarEvolution::LoopGuards::collectFromPHI( + ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const PHINode &Phi, SmallPtrSetImpl &VisitedBlocks, + SmallDenseMap &IncomingGuards, + unsigned Depth) { + if (!SE.isSCEVable(Phi.getType())) + return; + + using MinMaxPattern = std::pair; + auto GetMinMaxConst = [&](unsigned IncomingIdx) -> MinMaxPattern { + const BasicBlock *InBlock = Phi.getIncomingBlock(IncomingIdx); + if (!VisitedBlocks.insert(InBlock).second) + return {nullptr, scCouldNotCompute}; + auto [G, Inserted] = IncomingGuards.try_emplace(InBlock, LoopGuards(SE)); + if (Inserted) + collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks, + Depth + 1); + auto S = G->second.RewriteMap.find( + SE.getSCEV(Phi.getIncomingValue(IncomingIdx))); + if (S == G->second.RewriteMap.end()) + return {nullptr, scCouldNotCompute}; + auto *SM = dyn_cast_if_present(S->second); + if (!SM) + return {nullptr, scCouldNotCompute}; + if (const SCEVConstant *C0 = dyn_cast(SM->getOperand(0))) + return {C0, SM->getSCEVType()}; + if (const SCEVConstant *C1 = dyn_cast(SM->getOperand(1))) + return {C1, SM->getSCEVType()}; + return {nullptr, scCouldNotCompute}; + }; + auto MergeMinMaxConst = [](MinMaxPattern P1, + MinMaxPattern P2) -> MinMaxPattern { + auto [C1, T1] = P1; + auto [C2, T2] = P2; + if (!C1 || !C2 || T1 != T2) + return {nullptr, scCouldNotCompute}; + switch (T1) { + case scUMaxExpr: + return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1}; + case scSMaxExpr: + return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1}; + case scUMinExpr: + return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1}; + case scSMinExpr: + return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1}; + default: + llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs."); + } + }; + auto P = GetMinMaxConst(0); + for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) { + if (!P.first) + break; + P = MergeMinMaxConst(P, GetMinMaxConst(In)); + } + if (P.first) { + const SCEV *LHS = SE.getSCEV(const_cast(&Phi)); + SmallVector Ops({P.first, LHS}); + const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops); + Guards.RewriteMap.insert({LHS, RHS}); + } +} + +void ScalarEvolution::LoopGuards::collectFromBlock( + ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards, + const BasicBlock *Block, const BasicBlock *Pred, + SmallPtrSetImpl &VisitedBlocks, unsigned Depth) { SmallVector ExprsToRewrite; auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, @@ -15556,14 +15634,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { } }; - BasicBlock *Header = L->getHeader(); SmallVector> Terms; // First, collect information from assumptions dominating the loop. for (auto &AssumeVH : SE.AC.assumptions()) { if (!AssumeVH) continue; auto *AssumeI = cast(AssumeVH); - if (!SE.DT.dominates(AssumeI, Header)) + if (!SE.DT.dominates(AssumeI, Block)) continue; Terms.emplace_back(AssumeI->getOperand(0), true); } @@ -15574,8 +15651,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) - if (Guard->getFunction() == Header->getParent() && - SE.DT.dominates(Guard, Header)) + if (Guard->getFunction() == Block->getParent() && + SE.DT.dominates(Guard, Block)) Terms.emplace_back(Guard->getArgOperand(0), true); // Third, collect conditions from dominating branches. Starting at the loop @@ -15583,11 +15660,10 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { // predecessors that can be found that have unique successors leading to the // original header. // TODO: share this logic with isLoopEntryGuardedByCond. - for (std::pair Pair( - L->getLoopPredecessor(), Header); - Pair.first; + std::pair Pair(Pred, Block); + for (; Pair.first; Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) { - + VisitedBlocks.insert(Pair.second); const BranchInst *LoopEntryPredicate = dyn_cast(Pair.first->getTerminator()); if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) @@ -15595,6 +15671,22 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { Terms.emplace_back(LoopEntryPredicate->getCondition(), LoopEntryPredicate->getSuccessor(0) == Pair.second); + + // If we are recursively collecting guards stop after 2 + // predecessors to limit compile-time impact for now. + if (Depth > 0 && Terms.size() == 2) + break; + } + // Finally, if we stopped climbing the predecessor chain because + // there wasn't a unique one to continue, try to collect conditions + // for PHINodes by recursively following all of their incoming + // blocks and try to merge the found conditions to build a new one + // for the Phi. + if (Pair.second->hasNPredecessorsOrMore(2) && + Depth < MaxLoopGuardCollectionDepth) { + SmallDenseMap IncomingGuards; + for (auto &Phi : Pair.second->phis()) + collectFromPHI(SE, Guards, Phi, VisitedBlocks, IncomingGuards, Depth); } // Now apply the information from the collected conditions to @@ -15651,7 +15743,6 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { Guards.RewriteMap.insert({Expr, Guards.rewrite(RewriteTo)}); } } - return Guards; } const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const { diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll new file mode 100644 index 00000000000000..71d66ef04ade12 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll @@ -0,0 +1,279 @@ +; RUN: opt < %s -disable-output "-passes=print" -scalar-evolution-max-iterations=0 -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @slt(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'slt' +; CHECK-NEXT: Determining loop execution counts for: @slt +; CHECK-NEXT: Loop %loop: backedge-taken count is (19 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 18 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (19 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp slt i16 %a, 1 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp slt i16 %b, 4 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp sle i16 %count, 19 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, 1 + %exitcond = icmp eq i16 %iv.next, 20 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @ult(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'ult' +; CHECK-NEXT: Determining loop execution counts for: @ult +; CHECK-NEXT: Loop %loop: backedge-taken count is (21 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 19 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (21 + (-1 * %count)) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ult i16 %a, 2 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ult i16 %b, 5 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ule i16 %count, 20 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, 1 + %exitcond = icmp eq i16 %iv.next, 22 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @sgt(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'sgt' +; CHECK-NEXT: Determining loop execution counts for: @sgt +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 9 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp sgt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp sgt i16 %b, 8 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp sge i16 %count, 1 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @ugt(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'ugt' +; CHECK-NEXT: Determining loop execution counts for: @ugt +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 10 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ugt i16 %a, 11 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ugt i16 %b, 7 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ne i16 %count, 0 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @three_incoming(i16 %a, i16 %b, i1 %c, i1 %d) { +; CHECK-LABEL: 'three_incoming' +; CHECK-NEXT: Determining loop execution counts for: @three_incoming +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 11 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %entry2 + +entry2: + br i1 %d, label %b2, label %b3 + +b1: + %cmp1 = icmp ugt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ugt i16 %b, 8 + br i1 %cmp2, label %exit, label %preheader + +b3: + %cmp3 = icmp ugt i16 %b, 12 + br i1 %cmp3, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ], [ %b, %b3 ] + %cmp4 = icmp ne i16 %count, 0 + br i1 %cmp4, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @mixed(i16 %a, i16 %b, i1 %c) { +; CHECK-LABEL: 'mixed' +; CHECK-NEXT: Determining loop execution counts for: @mixed +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ugt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp sgt i16 %b, 8 + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ne i16 %count, 0 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @one_constant(i16 %a, i16 %b, i1 %c, i16 %d) { +; CHECK-LABEL: 'one_constant' +; CHECK-NEXT: Determining loop execution counts for: @one_constant +; CHECK-NEXT: Loop %loop: backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i16 -2 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-1 + %count) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +entry: + br i1 %c, label %b1, label %b2 + +b1: + %cmp1 = icmp ugt i16 %a, 10 + br i1 %cmp1, label %exit, label %preheader + +b2: + %cmp2 = icmp ugt i16 %b, %d + br i1 %cmp2, label %exit, label %preheader + +preheader: + %count = phi i16 [ %a, %b1 ], [ %b, %b2 ] + %cmp3 = icmp ne i16 %count, 0 + br i1 %cmp3, label %loop, label %exit + +loop: + %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ] + %iv.next = add i16 %iv, -1 + %exitcond = icmp eq i16 %iv.next, 0 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +define void @epilogue(i64 %count) { +; CHECK-LABEL: 'epilogue' +; CHECK-NEXT: Determining loop execution counts for: @epilogue +; CHECK-NEXT: Loop %epilogue: backedge-taken count is (-1 + %count.epilogue) +; CHECK-NEXT: Loop %epilogue: constant max backedge-taken count is i64 6 +; CHECK-NEXT: Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue) +; CHECK-NEXT: Loop %epilogue: Trip multiple is 1 +; CHECK-NEXT: Loop %while.body: backedge-taken count is ((-8 + %count) /u 8) +; CHECK-NEXT: Loop %while.body: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8) +; CHECK-NEXT: Loop %while.body: Trip multiple is 1 +entry: + %cmp = icmp ugt i64 %count, 7 + br i1 %cmp, label %while.body, label %epilogue.preheader + +while.body: + %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ] + %sub = add i64 %iv, -8 + %exitcond.not = icmp ugt i64 %sub, 7 + br i1 %exitcond.not, label %while.body, label %while.loopexit + +while.loopexit: + %sub.exit = phi i64 [ %sub, %while.body ] + br label %epilogue.preheader + +epilogue.preheader: + %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ] + %epilogue.cmp = icmp eq i64 %count.epilogue, 0 + br i1 %epilogue.cmp, label %exit, label %epilogue + +epilogue: + %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ] + %dec = add i64 %iv.epilogue, -1 + %exitcond.epilogue = icmp eq i64 %dec, 0 + br i1 %exitcond.epilogue, label %exit, label %epilogue + +exit: + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll index 70b002f766b753..966d7e3cded0ab 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll @@ -41,7 +41,7 @@ define void @apply_delta(ptr nocapture noundef %dst, ptr nocapture noundef reado ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1 ; CHECK-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1 ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]] ; CHECK: while.end9: ; CHECK-NEXT: ret void ;