From e403d25200ad75d1e65ee1c5ba2bfe64a7b6b5ab Mon Sep 17 00:00:00 2001
From: Julian Nagele <j.nagele@apple.com>
Date: Fri, 15 Nov 2024 10:03:08 +0000
Subject: [PATCH] [SCEV] Collect and merge loop guards through PHI nodes with
 multiple incoming values (#113915)

This patch aims to strengthen collection of loop guards by processing
PHI nodes with multiple incoming values as follows: collect guards for
all incoming values/blocks and try to merge them into a single one for
the PHI node.

The goal is to determine tighter bounds on the trip counts of scalar
tail loops after vectorization, helping to avoid unnecessary transforms.
In particular we'd like to avoid vectorizing scalar tails of
hand-vectorized loops, for example in
[Transforms/PhaseOrdering/X86/pr38280.ll](https://github.com/llvm/llvm-project/blob/231e03ba7e82896847dbc27d457dbb208f04699c/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll),
discovered via https://github.com/llvm/llvm-project/pull/108190

Compile-time impact: https://llvm-compile-time-tracker.com/compare.php?from=a55248789ed3f653740e0723d016203b9d585f26&to=500e4c46e79f60b93b11a752698c520e345948e3&stat=instructions:u

PR: https://github.com/llvm/llvm-project/pull/113915
---
 llvm/include/llvm/Analysis/ScalarEvolution.h  |  19 ++
 llvm/lib/Analysis/ScalarEvolution.cpp         | 111 ++++++-
 ...t-guard-info-with-multiple-predecessors.ll | 279 ++++++++++++++++++
 .../Transforms/PhaseOrdering/X86/pr38280.ll   |   2 +-
 4 files changed, 400 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 179a2c38d9d3c2..c254443e6ed018 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1316,6 +1316,25 @@ class ScalarEvolution {
 
     LoopGuards(ScalarEvolution &SE) : SE(SE) {}
 
+    /// Recursively collect loop guards in \p Guards, starting from
+    /// block \p Block with predecessor \p Pred. The intended starting point
+    /// is to collect from a loop header and its predecessor.
+    static void
+    collectFromBlock(ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+                     const BasicBlock *Block, const BasicBlock *Pred,
+                     SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
+                     unsigned Depth = 0);
+
+    /// Collect loop guards in \p Guards, starting from PHINode \p
+    /// Phi, by calling \p collectFromBlock on the incoming blocks of
+    /// \Phi and trying to merge the found constraints into a single
+    /// combined on for \p Phi.
+    static void collectFromPHI(
+        ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+        const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
+        SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
+        unsigned Depth);
+
   public:
     /// Collect rewrite map for loop guards for loop \p L, together with flags
     /// indicating if NUW and NSW can be preserved during rewriting.
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c939270ed39a65..8ab40dbbab7751 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -222,6 +222,10 @@ static cl::opt<unsigned> RangeIterThreshold(
     cl::desc("Threshold for switching to iteratively computing SCEV ranges"),
     cl::init(32));
 
+static cl::opt<unsigned> MaxLoopGuardCollectionDepth(
+    "scalar-evolution-max-loop-guard-collection-depth", cl::Hidden,
+    cl::desc("Maximum depth for recrusive loop guard collection"), cl::init(1));
+
 static cl::opt<bool>
 ClassifyExpressions("scalar-evolution-classify-expressions",
     cl::Hidden, cl::init(true),
@@ -10648,7 +10652,7 @@ ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB)
   if (const Loop *L = LI.getLoopFor(BB))
     return {L->getLoopPredecessor(), L->getHeader()};
 
-  return {nullptr, nullptr};
+  return {nullptr, BB};
 }
 
 /// SCEV structural equivalence is usually sufficient for testing whether two
@@ -15217,7 +15221,81 @@ bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
 
 ScalarEvolution::LoopGuards
 ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Pred = L->getLoopPredecessor();
   LoopGuards Guards(SE);
+  SmallPtrSet<const BasicBlock *, 8> VisitedBlocks;
+  collectFromBlock(SE, Guards, Header, Pred, VisitedBlocks);
+  return Guards;
+}
+
+void ScalarEvolution::LoopGuards::collectFromPHI(
+    ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+    const PHINode &Phi, SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks,
+    SmallDenseMap<const BasicBlock *, LoopGuards> &IncomingGuards,
+    unsigned Depth) {
+  if (!SE.isSCEVable(Phi.getType()))
+    return;
+
+  using MinMaxPattern = std::pair<const SCEVConstant *, SCEVTypes>;
+  auto GetMinMaxConst = [&](unsigned IncomingIdx) -> MinMaxPattern {
+    const BasicBlock *InBlock = Phi.getIncomingBlock(IncomingIdx);
+    if (!VisitedBlocks.insert(InBlock).second)
+      return {nullptr, scCouldNotCompute};
+    auto [G, Inserted] = IncomingGuards.try_emplace(InBlock, LoopGuards(SE));
+    if (Inserted)
+      collectFromBlock(SE, G->second, Phi.getParent(), InBlock, VisitedBlocks,
+                       Depth + 1);
+    auto S = G->second.RewriteMap.find(
+        SE.getSCEV(Phi.getIncomingValue(IncomingIdx)));
+    if (S == G->second.RewriteMap.end())
+      return {nullptr, scCouldNotCompute};
+    auto *SM = dyn_cast_if_present<SCEVMinMaxExpr>(S->second);
+    if (!SM)
+      return {nullptr, scCouldNotCompute};
+    if (const SCEVConstant *C0 = dyn_cast<SCEVConstant>(SM->getOperand(0)))
+      return {C0, SM->getSCEVType()};
+    if (const SCEVConstant *C1 = dyn_cast<SCEVConstant>(SM->getOperand(1)))
+      return {C1, SM->getSCEVType()};
+    return {nullptr, scCouldNotCompute};
+  };
+  auto MergeMinMaxConst = [](MinMaxPattern P1,
+                             MinMaxPattern P2) -> MinMaxPattern {
+    auto [C1, T1] = P1;
+    auto [C2, T2] = P2;
+    if (!C1 || !C2 || T1 != T2)
+      return {nullptr, scCouldNotCompute};
+    switch (T1) {
+    case scUMaxExpr:
+      return {C1->getAPInt().ult(C2->getAPInt()) ? C1 : C2, T1};
+    case scSMaxExpr:
+      return {C1->getAPInt().slt(C2->getAPInt()) ? C1 : C2, T1};
+    case scUMinExpr:
+      return {C1->getAPInt().ugt(C2->getAPInt()) ? C1 : C2, T1};
+    case scSMinExpr:
+      return {C1->getAPInt().sgt(C2->getAPInt()) ? C1 : C2, T1};
+    default:
+      llvm_unreachable("Trying to merge non-MinMaxExpr SCEVs.");
+    }
+  };
+  auto P = GetMinMaxConst(0);
+  for (unsigned int In = 1; In < Phi.getNumIncomingValues(); In++) {
+    if (!P.first)
+      break;
+    P = MergeMinMaxConst(P, GetMinMaxConst(In));
+  }
+  if (P.first) {
+    const SCEV *LHS = SE.getSCEV(const_cast<PHINode *>(&Phi));
+    SmallVector<const SCEV *, 2> Ops({P.first, LHS});
+    const SCEV *RHS = SE.getMinMaxExpr(P.second, Ops);
+    Guards.RewriteMap.insert({LHS, RHS});
+  }
+}
+
+void ScalarEvolution::LoopGuards::collectFromBlock(
+    ScalarEvolution &SE, ScalarEvolution::LoopGuards &Guards,
+    const BasicBlock *Block, const BasicBlock *Pred,
+    SmallPtrSetImpl<const BasicBlock *> &VisitedBlocks, unsigned Depth) {
   SmallVector<const SCEV *> ExprsToRewrite;
   auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
                               const SCEV *RHS,
@@ -15556,14 +15634,13 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
     }
   };
 
-  BasicBlock *Header = L->getHeader();
   SmallVector<PointerIntPair<Value *, 1, bool>> Terms;
   // First, collect information from assumptions dominating the loop.
   for (auto &AssumeVH : SE.AC.assumptions()) {
     if (!AssumeVH)
       continue;
     auto *AssumeI = cast<CallInst>(AssumeVH);
-    if (!SE.DT.dominates(AssumeI, Header))
+    if (!SE.DT.dominates(AssumeI, Block))
       continue;
     Terms.emplace_back(AssumeI->getOperand(0), true);
   }
@@ -15574,8 +15651,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
   if (GuardDecl)
     for (const auto *GU : GuardDecl->users())
       if (const auto *Guard = dyn_cast<IntrinsicInst>(GU))
-        if (Guard->getFunction() == Header->getParent() &&
-            SE.DT.dominates(Guard, Header))
+        if (Guard->getFunction() == Block->getParent() &&
+            SE.DT.dominates(Guard, Block))
           Terms.emplace_back(Guard->getArgOperand(0), true);
 
   // Third, collect conditions from dominating branches. Starting at the loop
@@ -15583,11 +15660,10 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
   // predecessors that can be found that have unique successors leading to the
   // original header.
   // TODO: share this logic with isLoopEntryGuardedByCond.
-  for (std::pair<const BasicBlock *, const BasicBlock *> Pair(
-           L->getLoopPredecessor(), Header);
-       Pair.first;
+  std::pair<const BasicBlock *, const BasicBlock *> Pair(Pred, Block);
+  for (; Pair.first;
        Pair = SE.getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
-
+    VisitedBlocks.insert(Pair.second);
     const BranchInst *LoopEntryPredicate =
         dyn_cast<BranchInst>(Pair.first->getTerminator());
     if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional())
@@ -15595,6 +15671,22 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
 
     Terms.emplace_back(LoopEntryPredicate->getCondition(),
                        LoopEntryPredicate->getSuccessor(0) == Pair.second);
+
+    // If we are recursively collecting guards stop after 2
+    // predecessors to limit compile-time impact for now.
+    if (Depth > 0 && Terms.size() == 2)
+      break;
+  }
+  // Finally, if we stopped climbing the predecessor chain because
+  // there wasn't a unique one to continue, try to collect conditions
+  // for PHINodes by recursively following all of their incoming
+  // blocks and try to merge the found conditions to build a new one
+  // for the Phi.
+  if (Pair.second->hasNPredecessorsOrMore(2) &&
+      Depth < MaxLoopGuardCollectionDepth) {
+    SmallDenseMap<const BasicBlock *, LoopGuards> IncomingGuards;
+    for (auto &Phi : Pair.second->phis())
+      collectFromPHI(SE, Guards, Phi, VisitedBlocks, IncomingGuards, Depth);
   }
 
   // Now apply the information from the collected conditions to
@@ -15651,7 +15743,6 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
       Guards.RewriteMap.insert({Expr, Guards.rewrite(RewriteTo)});
     }
   }
-  return Guards;
 }
 
 const SCEV *ScalarEvolution::LoopGuards::rewrite(const SCEV *Expr) const {
diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
new file mode 100644
index 00000000000000..71d66ef04ade12
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info-with-multiple-predecessors.ll
@@ -0,0 +1,279 @@
+; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" -scalar-evolution-max-iterations=0  -scalar-evolution-classify-expressions=0  2>&1 | FileCheck %s
+
+define void @slt(i16 %a, i16 %b, i1 %c) {
+; CHECK-LABEL: 'slt'
+; CHECK-NEXT:  Determining loop execution counts for: @slt
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (19 + (-1 * %count)<nsw>)<nsw>
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 18
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (19 + (-1 * %count)<nsw>)<nsw>
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %b2
+
+b1:
+  %cmp1 = icmp slt i16 %a, 1
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp slt i16 %b, 4
+  br i1 %cmp2, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ]
+  %cmp3 = icmp sle i16 %count, 19
+  br i1 %cmp3, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, 1
+  %exitcond = icmp eq i16 %iv.next, 20
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @ult(i16 %a, i16 %b, i1 %c) {
+; CHECK-LABEL: 'ult'
+; CHECK-NEXT:  Determining loop execution counts for: @ult
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (21 + (-1 * %count))
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 19
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (21 + (-1 * %count))
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %b2
+
+b1:
+  %cmp1 = icmp ult i16 %a, 2
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp ult i16 %b, 5
+  br i1 %cmp2, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ]
+  %cmp3 = icmp ule i16 %count, 20
+  br i1 %cmp3, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, 1
+  %exitcond = icmp eq i16 %iv.next, 22
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @sgt(i16 %a, i16 %b, i1 %c) {
+; CHECK-LABEL: 'sgt'
+; CHECK-NEXT:  Determining loop execution counts for: @sgt
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (-1 + %count)
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 9
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (-1 + %count)
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %b2
+
+b1:
+  %cmp1 = icmp sgt i16 %a, 10
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp sgt i16 %b, 8
+  br i1 %cmp2, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ]
+  %cmp3 = icmp sge i16 %count, 1
+  br i1 %cmp3, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, -1
+  %exitcond = icmp eq i16 %iv.next, 0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @ugt(i16 %a, i16 %b, i1 %c) {
+; CHECK-LABEL: 'ugt'
+; CHECK-NEXT:  Determining loop execution counts for: @ugt
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (-1 + %count)<nsw>
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 10
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (-1 + %count)<nsw>
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %b2
+
+b1:
+  %cmp1 = icmp ugt i16 %a, 11
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp ugt i16 %b, 7
+  br i1 %cmp2, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ]
+  %cmp3 = icmp ne i16 %count, 0
+  br i1 %cmp3, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, -1
+  %exitcond = icmp eq i16 %iv.next, 0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @three_incoming(i16 %a, i16 %b, i1 %c, i1 %d) {
+; CHECK-LABEL: 'three_incoming'
+; CHECK-NEXT:  Determining loop execution counts for: @three_incoming
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (-1 + %count)<nsw>
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 11
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (-1 + %count)<nsw>
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %entry2
+
+entry2:
+  br i1 %d, label %b2, label %b3
+
+b1:
+  %cmp1 = icmp ugt i16 %a, 10
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp ugt i16 %b, 8
+  br i1 %cmp2, label %exit, label %preheader
+
+b3:
+  %cmp3 = icmp ugt i16 %b, 12
+  br i1 %cmp3, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ], [ %b, %b3 ]
+  %cmp4 = icmp ne i16 %count, 0
+  br i1 %cmp4, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, -1
+  %exitcond = icmp eq i16 %iv.next, 0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @mixed(i16 %a, i16 %b, i1 %c) {
+; CHECK-LABEL: 'mixed'
+; CHECK-NEXT:  Determining loop execution counts for: @mixed
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (-1 + %count)
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 -2
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (-1 + %count)
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %b2
+
+b1:
+  %cmp1 = icmp ugt i16 %a, 10
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp sgt i16 %b, 8
+  br i1 %cmp2, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ]
+  %cmp3 = icmp ne i16 %count, 0
+  br i1 %cmp3, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, -1
+  %exitcond = icmp eq i16 %iv.next, 0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @one_constant(i16 %a, i16 %b, i1 %c, i16 %d) {
+; CHECK-LABEL: 'one_constant'
+; CHECK-NEXT:  Determining loop execution counts for: @one_constant
+; CHECK-NEXT:  Loop %loop: backedge-taken count is (-1 + %count)
+; CHECK-NEXT:  Loop %loop: constant max backedge-taken count is i16 -2
+; CHECK-NEXT:  Loop %loop: symbolic max backedge-taken count is (-1 + %count)
+; CHECK-NEXT:  Loop %loop: Trip multiple is 1
+entry:
+  br i1 %c, label %b1, label %b2
+
+b1:
+  %cmp1 = icmp ugt i16 %a, 10
+  br i1 %cmp1, label %exit, label %preheader
+
+b2:
+  %cmp2 = icmp ugt i16 %b, %d
+  br i1 %cmp2, label %exit, label %preheader
+
+preheader:
+  %count = phi i16 [ %a, %b1 ], [ %b, %b2 ]
+  %cmp3 = icmp ne i16 %count, 0
+  br i1 %cmp3, label %loop, label %exit
+
+loop:
+  %iv = phi i16 [ %iv.next, %loop ], [ %count, %preheader ]
+  %iv.next = add i16 %iv, -1
+  %exitcond = icmp eq i16 %iv.next, 0
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @epilogue(i64 %count) {
+; CHECK-LABEL: 'epilogue'
+; CHECK-NEXT:  Determining loop execution counts for: @epilogue
+; CHECK-NEXT:  Loop %epilogue: backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT:  Loop %epilogue: constant max backedge-taken count is i64 6
+; CHECK-NEXT:  Loop %epilogue: symbolic max backedge-taken count is (-1 + %count.epilogue)
+; CHECK-NEXT:  Loop %epilogue: Trip multiple is 1
+; CHECK-NEXT:  Loop %while.body: backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT:  Loop %while.body: constant max backedge-taken count is i64 2305843009213693951
+; CHECK-NEXT:  Loop %while.body: symbolic max backedge-taken count is ((-8 + %count) /u 8)
+; CHECK-NEXT:  Loop %while.body: Trip multiple is 1
+entry:
+  %cmp = icmp ugt i64 %count, 7
+  br i1 %cmp, label %while.body, label %epilogue.preheader
+
+while.body:
+  %iv = phi i64 [ %sub, %while.body ], [ %count, %entry ]
+  %sub = add i64 %iv, -8
+  %exitcond.not = icmp ugt i64 %sub, 7
+  br i1 %exitcond.not, label %while.body, label %while.loopexit
+
+while.loopexit:
+  %sub.exit = phi i64 [ %sub, %while.body ]
+  br label %epilogue.preheader
+
+epilogue.preheader:
+  %count.epilogue = phi i64 [ %count, %entry ], [ %sub.exit, %while.loopexit ]
+  %epilogue.cmp = icmp eq i64 %count.epilogue, 0
+  br i1 %epilogue.cmp, label %exit, label %epilogue
+
+epilogue:
+  %iv.epilogue = phi i64 [ %dec, %epilogue ], [ %count.epilogue, %epilogue.preheader ]
+  %dec = add i64 %iv.epilogue, -1
+  %exitcond.epilogue = icmp eq i64 %dec, 0
+  br i1 %exitcond.epilogue, label %exit, label %epilogue
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
index 70b002f766b753..966d7e3cded0ab 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr38280.ll
@@ -41,7 +41,7 @@ define void @apply_delta(ptr nocapture noundef %dst, ptr nocapture noundef reado
 ; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[DST_ADDR_130]], i64 1
 ; CHECK-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i8, ptr [[SRC_ADDR_129]], i64 1
 ; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[DEC]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END9]], label [[WHILE_BODY4]]
 ; CHECK:       while.end9:
 ; CHECK-NEXT:    ret void
 ;