From 22a4d4f35fbf6862631aaa8e7ad10fd3fcd32b92 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Mon, 13 Jan 2020 23:49:03 +0100
Subject: [PATCH 1/7] [InstCombine] Add test for iterator invalidation bug; NFC

---
 .../InstCombine/bitcast-phi-uselistorder.ll   | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll
diff --git a/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll b/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll
new file mode 100644
index 0000000000000..d1af21f6c4d51
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+@Q = internal unnamed_addr global double 1.000000e+00, align 8
+
+define double @test(i1 %c, i64* %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[END:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, i64* bitcast (double* @Q to i64*), align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[LOAD]] to double
+; CHECK-NEXT:    [[PHITMP:%.*]] = bitcast i64 [[LOAD]] to double
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[PHITMP]], [[IF]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P:%.*]] to double*
+; CHECK-NEXT:    store double [[TMP1]], double* [[TMP2]], align 8
+; CHECK-NEXT:    ret double [[PHI]]
+;
+entry:
+  br i1 %c, label %if, label %end
+
+if:
+  %load = load i64, i64* bitcast (double* @Q to i64*), align 8
+  br label %end
+
+end:
+  %phi = phi i64 [ 0, %entry ], [ %load, %if ]
+  store i64 %phi, i64* %p, align 8
+  %cast = bitcast i64 %phi to double
+  ret double %cast
+
+  uselistorder i64 %phi, { 1, 0 }
+}

From bccfb333df67477f1312f68d319856513582ecd6 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Mon, 13 Jan 2020 23:54:42 +0100
Subject: [PATCH 2/7] [InstCombine] Fix user iterator invalidation in bitcast
 of phi transform

This fixes the issue encountered in D71164. Instead of using a
range-based for, manually iterate over the users and advance the
iterator beforehand, so we do not skip any users due to iterator
invalidation.

Differential Revision: https://reviews.llvm.org/D72657
---
 .../lib/Transforms/InstCombine/InstCombineCasts.cpp |  5 ++++-
 .../InstCombine/bitcast-phi-uselistorder.ll         | 13 +++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 2c9ba203fbf3d..ae90bf646d0f5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2259,7 +2259,10 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
   Instruction *RetVal = nullptr;
   for (auto *OldPN : OldPhiNodes) {
     PHINode *NewPN = NewPNodes[OldPN];
-    for (User *V : OldPN->users()) {
+    for (auto It = OldPN->user_begin(), End = OldPN->user_end(); It != End; ) {
+      User *V = *It;
+      // We may remove this user, advance to avoid iterator invalidation.
+      ++It;
       if (auto *SI = dyn_cast<StoreInst>(V)) {
         if (SI->isSimple() && SI->getOperand(0) == OldPN) {
           Builder.SetInsertPoint(SI);
diff --git a/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll b/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll
index d1af21f6c4d51..d5489484bddf2 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-phi-uselistorder.ll
@@ -8,16 +8,13 @@ define double @test(i1 %c, i64* %p) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF:%.*]], label [[END:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, i64* bitcast (double* @Q to i64*), align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 [[LOAD]] to double
-; CHECK-NEXT:    [[PHITMP:%.*]] = bitcast i64 [[LOAD]] to double
+; CHECK-NEXT:    [[LOAD1:%.*]] = load double, double* @Q, align 8
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ]
-; CHECK-NEXT:    [[PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[PHITMP]], [[IF]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[P:%.*]] to double*
-; CHECK-NEXT:    store double [[TMP1]], double* [[TMP2]], align 8
-; CHECK-NEXT:    ret double [[PHI]]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[LOAD1]], [[IF]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[P:%.*]] to double*
+; CHECK-NEXT:    store double [[TMP0]], double* [[TMP1]], align 8
+; CHECK-NEXT:    ret double [[TMP0]]
 ;
 entry:
   br i1 %c, label %if, label %end

From e35eeb67f261205c55ab7450490ba23f5d2fd441 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Mon, 13 Jan 2020 18:56:28 +0100
Subject: [PATCH 3/7] [InstCombine] Make combineLoadToNewType a method; NFC

So it can be reused as part of other combines.
In particular for D71164.

Conflicts:
	llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
---
 .../InstCombine/InstCombineInternal.h         |  3 +++
 .../InstCombineLoadStoreAlloca.cpp            | 27 +++++++++----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 434b0d5912157..c2951cc6f4c4f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -405,6 +405,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
   /// \return true if successful.
   bool replacePointer(Instruction &I, Value *V);
 
+  LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
+                                 const Twine &Suffix = "");
+
 private:
   bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
   bool shouldChangeType(Type *From, Type *To) const;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 054fb7da09a22..8a80efa729b19 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -448,8 +448,8 @@ static bool isSupportedAtomicType(Type *Ty) {
 ///
 /// Note that this will create all of the instructions with whatever insert
 /// point the \c InstCombiner currently is using.
-static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy,
-                                      const Twine &Suffix = "") {
+LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy,
+                                             const Twine &Suffix) {
   assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
          "can't fold an atomic load to requested type");
 
@@ -462,9 +462,9 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
   if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
         NewPtr->getType()->getPointerElementType() == NewTy &&
         NewPtr->getType()->getPointerAddressSpace() == AS))
-    NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
+    NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
 
-  LoadInst *NewLoad = IC.Builder.CreateAlignedLoad(
+  LoadInst *NewLoad = Builder.CreateAlignedLoad(
       NewTy, NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
   NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
   MDBuilder MDB(NewLoad->getContext());
@@ -505,7 +505,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
         NewLoad->setMetadata(ID, N);
       break;
     case LLVMContext::MD_range:
-      copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad);
+      copyRangeMetadata(getDataLayout(), LI, N, *NewLoad);
       break;
     }
   }
@@ -639,9 +639,8 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
           return SI && SI->getPointerOperand() != &LI &&
                  !SI->getPointerOperand()->isSwiftError();
         })) {
-      LoadInst *NewLoad = combineLoadToNewType(
-          IC, LI,
-          Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
+      LoadInst *NewLoad = IC.combineLoadToNewType(
+          LI, Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
       // Replace all the stores with stores of the newly loaded value.
       for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
         auto *SI = cast<StoreInst>(*UI++);
@@ -663,7 +662,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
     if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
       if (CI->isNoopCast(DL))
         if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
-          LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy());
+          LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
           CI->replaceAllUsesWith(NewLoad);
           IC.eraseInstFromFunction(*CI);
           return &LI;
@@ -691,8 +690,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
     // If the struct only have one element, we unpack.
     auto NumElements = ST->getNumElements();
     if (NumElements == 1) {
-      LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
-                                               ".unpack");
+      LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
+                                                  ".unpack");
       AAMDNodes AAMD;
       LI.getAAMetadata(AAMD);
       NewLoad->setAAMetadata(AAMD);
@@ -741,7 +740,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
     auto *ET = AT->getElementType();
     auto NumElements = AT->getNumElements();
     if (NumElements == 1) {
-      LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
+      LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
       AAMDNodes AAMD;
       LI.getAAMetadata(AAMD);
       NewLoad->setAAMetadata(AAMD);
@@ -1377,8 +1376,8 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC,
     return false;
 
   IC.Builder.SetInsertPoint(LI);
-  LoadInst *NewLI = combineLoadToNewType(
-      IC, *LI, LoadAddr->getType()->getPointerElementType());
+  LoadInst *NewLI = IC.combineLoadToNewType(
+      *LI, LoadAddr->getType()->getPointerElementType());
   // Replace all the stores with stores of the newly loaded value.
   for (auto *UI : LI->users()) {
     auto *USI = cast<StoreInst>(UI);

From 1beab05cb34483eca08786fb43a17ab301bf9a68 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Mon, 13 Jan 2020 18:57:14 +0100
Subject: [PATCH 4/7] [InstCombine] Fix infinite loop due to bitcast <-> phi
 transforms

Fix for https://bugs.llvm.org/show_bug.cgi?id=44245.

The optimizeBitCastFromPhi() and FoldPHIArgOpIntoPHI() end up
fighting against each other, because optimizeBitCastFromPhi()
assumes that bitcasts of loads will get folded. This doesn't
happen here, because a dangling phi node prevents the one-use
fold in https://github.com/llvm/llvm-project/blob/master/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp#L620-L628 from triggering.

This patch fixes the issue by explicitly performing the load
combine as part of the bitcast of phi transform. Other attempts
to force the load to be combined first were ultimately too
unreliable.

Differential Revision: https://reviews.llvm.org/D71164
---
 .../InstCombine/InstCombineCasts.cpp          |  11 +-
 llvm/test/Transforms/InstCombine/pr44245.ll   | 192 ++++++++++++++++++
 2 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/pr44245.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ae90bf646d0f5..20d6242901eaa 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2234,9 +2234,14 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
       if (auto *C = dyn_cast<Constant>(V)) {
         NewV = ConstantExpr::getBitCast(C, DestTy);
       } else if (auto *LI = dyn_cast<LoadInst>(V)) {
-        Builder.SetInsertPoint(LI->getNextNode());
-        NewV = Builder.CreateBitCast(LI, DestTy);
-        Worklist.Add(LI);
+        // Explicitly perform load combine to make sure no opposing transform
+        // can remove the bitcast in the meantime and trigger an infinite loop.
+        Builder.SetInsertPoint(LI);
+        NewV = combineLoadToNewType(*LI, DestTy);
+        // Remove the old load and its use in the old phi, which itself becomes
+        // dead once the whole transform finishes.
+        replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
+        eraseInstFromFunction(*LI);
       } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
         NewV = BCI->getOperand(0);
       } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
diff --git a/llvm/test/Transforms/InstCombine/pr44245.ll b/llvm/test/Transforms/InstCombine/pr44245.ll
new file mode 100644
index 0000000000000..f75e26e87add8
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr44245.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine -instcombine-infinite-loop-threshold=2 < %s | FileCheck %s
+
+; This used to cause an infinite instcombine loop.
+
+define void @test(i1 %c) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  bb16:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB17:%.*]], label [[BB24:%.*]]
+; CHECK:       bb17:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i8* [ [[TMP1:%.*]], [[BB47:%.*]] ], [ undef, [[BB16:%.*]] ]
+; CHECK-NEXT:    store i8* [[TMP0]], i8** undef, align 8
+; CHECK-NEXT:    ret void
+; CHECK:       bb24:
+; CHECK-NEXT:    br i1 [[C]], label [[BB44:%.*]], label [[BB49:%.*]]
+; CHECK:       bb44:
+; CHECK-NEXT:    [[TMP467:%.*]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb47:
+; CHECK-NEXT:    [[TMP1]] = phi i8* [ [[TMP2:%.*]], [[BB150:%.*]] ], [ [[TMP1221:%.*]], [[BB119:%.*]] ], [ [[TMP1032:%.*]], [[BB101:%.*]] ], [ [[TMP933:%.*]], [[BB91:%.*]] ], [ [[TMP834:%.*]], [[BB81:%.*]] ], [ [[TMP705:%.*]], [[BB67:%.*]] ], [ [[TMP586:%.*]], [[BB56:%.*]] ], [ [[TMP467]], [[BB44]] ]
+; CHECK-NEXT:    br label [[BB17]]
+; CHECK:       bb49:
+; CHECK-NEXT:    br i1 [[C]], label [[BB56]], label [[BB59:%.*]]
+; CHECK:       bb56:
+; CHECK-NEXT:    [[TMP586]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb59:
+; CHECK-NEXT:    br i1 [[C]], label [[BB67]], label [[BB71:%.*]]
+; CHECK:       bb67:
+; CHECK-NEXT:    [[TMP705]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb71:
+; CHECK-NEXT:    br i1 [[C]], label [[BB81]], label [[BB84:%.*]]
+; CHECK:       bb81:
+; CHECK-NEXT:    [[TMP834]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb84:
+; CHECK-NEXT:    br i1 [[C]], label [[BB91]], label [[BB94:%.*]]
+; CHECK:       bb91:
+; CHECK-NEXT:    [[TMP933]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb94:
+; CHECK-NEXT:    br i1 [[C]], label [[BB101]], label [[BB104:%.*]]
+; CHECK:       bb101:
+; CHECK-NEXT:    [[TMP1032]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb104:
+; CHECK-NEXT:    br i1 [[C]], label [[BB119]], label [[BB123:%.*]]
+; CHECK:       bb119:
+; CHECK-NEXT:    [[TMP1221]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb123:
+; CHECK-NEXT:    br i1 [[C]], label [[BB147:%.*]], label [[BB152:%.*]]
+; CHECK:       bb147:
+; CHECK-NEXT:    [[TMP1499:%.*]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    br label [[BB150]]
+; CHECK:       bb150:
+; CHECK-NEXT:    [[TMP2]] = phi i8* [ [[TMP1848:%.*]], [[BB152]] ], [ [[TMP1499]], [[BB147]] ]
+; CHECK-NEXT:    br label [[BB47]]
+; CHECK:       bb152:
+; CHECK-NEXT:    [[TMP1848]] = load i8*, i8** inttoptr (i64 16 to i8**), align 16
+; CHECK-NEXT:    call void undef()
+; CHECK-NEXT:    br label [[BB150]]
+;
+bb16:                                             ; preds = %bb
+  br i1 %c, label %bb17, label %bb24
+
+bb17:                                             ; preds = %bb47, %bb17
+  %0 = phi i8* [ %1, %bb47 ], [ undef, %bb16 ]
+  store i8* %0, i8** undef, align 8
+  ret void
+
+bb24:                                             ; preds = %bb24
+  br i1 %c, label %bb44, label %bb49
+
+bb44:                                             ; preds = %bb43
+  %tmp46 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb47:                                             ; preds = %bb150, %bb119, %bb101, %bb91, %bb81, %bb67, %bb56, %bb44
+  %.in1 = phi i64* [ %.in, %bb150 ], [ %tmp122, %bb119 ], [ %tmp103, %bb101 ], [ %tmp93, %bb91 ], [ %tmp83, %bb81 ], [ %tmp70, %bb67 ], [ %tmp58, %bb56 ], [ %tmp46, %bb44 ]
+  %1 = bitcast i64* %.in1 to i8*
+  br label %bb17
+
+bb49:                                             ; preds = %bb49
+  br i1 %c, label %bb56, label %bb59
+
+bb56:                                             ; preds = %bb55
+  %tmp58 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb59:                                             ; preds = %bb59
+  br i1 %c, label %bb67, label %bb71
+
+bb67:                                             ; preds = %bb66
+  %tmp70 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb71:                                             ; preds = %bb71
+  br i1 %c, label %bb81, label %bb84
+
+bb81:                                             ; preds = %bb80
+  %tmp83 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb84:                                             ; preds = %bb84
+  br i1 %c, label %bb91, label %bb94
+
+bb91:                                             ; preds = %bb90
+  %tmp93 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb94:                                            ; preds = %bb94
+  br i1 %c, label %bb101, label %bb104
+
+bb101:                                            ; preds = %bb100
+  %tmp103 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb104:                                            ; preds = %bb104
+  br i1 %c, label %bb119, label %bb123
+
+bb119:                                            ; preds = %bb118
+  %tmp122 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb47
+
+bb123:                                            ; preds = %bb123
+  br i1 %c, label %bb147, label %bb152
+
+bb147:                                            ; preds = %bb146
+  %tmp149 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  br label %bb150
+
+bb150:                                            ; preds = %bb152, %bb147
+  %.in = phi i64* [ %tmp184, %bb152 ], [ %tmp149, %bb147 ]
+  br label %bb47
+
+bb152:                                            ; preds = %bb146
+  %tmp184 = load i64*, i64** inttoptr (i64 16 to i64**), align 16
+  call void undef()
+  br label %bb150
+}
+
+; This used to cause an instcombine loop when the problem above was
+; addressed in a non-robust fashion.
+
+%type_1 = type {}
+%type_2 = type {}
+%type_3 = type {}
+
+define void @test_2(i1 %c) local_unnamed_addr {
+; CHECK-LABEL: @test_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[COND_TRUE133:%.*]], label [[COND_FALSE138:%.*]]
+; CHECK:       cond.true133:
+; CHECK-NEXT:    store %type_3* undef, %type_3** null, align 536870912
+; CHECK-NEXT:    br label [[COND_END144:%.*]]
+; CHECK:       cond.false138:
+; CHECK-NEXT:    store %type_3* undef, %type_3** null, align 536870912
+; CHECK-NEXT:    br label [[COND_END144]]
+; CHECK:       cond.end144:
+; CHECK-NEXT:    br label [[WHILE_COND]]
+;
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %cond.end144, %entry
+  %link.0 = phi %type_2* [ undef, %entry ], [ %cond145, %cond.end144 ]
+  %os115 = bitcast %type_2* %link.0 to %type_3*
+  %ou116 = getelementptr inbounds %type_3, %type_3* %os115, i32 0
+  %os1117 = bitcast %type_3* %ou116 to %type_1*
+  br label %for.cond
+
+for.cond:                                         ; preds = %while.cond
+  br i1 %c, label %cond.true133, label %cond.false138
+
+cond.true133:                                     ; preds = %sw.epilog
+  %0 = load %type_2*, %type_2** undef, align 8
+  br label %cond.end144
+
+cond.false138:                                    ; preds = %sw.epilog
+  %1 = load %type_2*, %type_2** undef, align 8
+  br label %cond.end144
+
+cond.end144:                                      ; preds = %cond.false138, %cond.true133
+  %cond145 = phi %type_2* [ %0, %cond.true133 ], [ %1, %cond.false138 ]
+  br label %while.cond
+}

From c6a5418f76c308bcce1c7232e79bac301af81a7a Mon Sep 17 00:00:00 2001
From: Jakub Kuderski <kubak@google.com>
Date: Fri, 20 Dec 2019 16:08:52 -0500
Subject: [PATCH 5/7] [InstCombine] Improve infinite loop detection

Summary:
This patch limits the default number of iterations performed by InstCombine. It also exposes a new option that allows specifying how many iterations are treated as being stuck in an infinite loop.

Based on experiments performed on real-world C++ programs, InstCombine seems to perform at most ~8-20 iterations, so treating 1000 iterations as an infinite loop seems like a safe choice. See D71145 for details.

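The two limits can be specified via command line options. (Note: the diff in this patch only adds the -instcombine-infinite-loop-threshold option; InstCombineDefaultMaxIterations is declared but is not used by any option here.)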

Reviewers: spatel, lebedev.ri, nikic, xbolva00, grosser

Reviewed By: spatel

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71673
---
 .../InstCombine/InstructionCombining.cpp       | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 385f4926b845a..8e8a302c56fe0 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -121,6 +121,9 @@ STATISTIC(NumReassoc  , "Number of reassociations");
 DEBUG_COUNTER(VisitCounter, "instcombine-visit",
               "Controls which instructions are visited");
 
+static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
+static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
+
 static cl::opt<bool>
 EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
                                               cl::init(true));
@@ -129,6 +132,12 @@ static cl::opt<bool>
 EnableExpensiveCombines("expensive-combines",
                         cl::desc("Enable expensive instruction combines"));
 
+static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
+    "instcombine-infinite-loop-threshold",
+    cl::desc("Number of instruction combining iterations considered an "
+             "infinite loop"),
+    cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);
+
 static cl::opt<unsigned>
 MaxArraySize("instcombine-maxarray-size", cl::init(1024),
              cl::desc("Maximum array size considered when doing a combine"));
@@ -3508,9 +3517,16 @@ static bool combineInstructionsOverFunction(
     MadeIRChange = LowerDbgDeclare(F);
 
   // Iterate while there is work to do.
-  int Iteration = 0;
+  unsigned Iteration = 0;
   while (true) {
     ++Iteration;
+
+    if (Iteration > InfiniteLoopDetectionThreshold) {
+      report_fatal_error(
+          "Instruction Combining seems stuck in an infinite loop after " +
+          Twine(InfiniteLoopDetectionThreshold) + " iterations.");
+    }
+
     LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
                       << F.getName() << "\n");
 

From f6905cfc468321e555b967106f3555885718da58 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Tue, 31 Dec 2019 12:10:06 +0100
Subject: [PATCH 6/7] [InstCombine] Add tests for PR44242

Differential Revision: https://reviews.llvm.org/D71260
---
 llvm/test/Transforms/InstCombine/pr44242.ll | 192 ++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/pr44242.ll

diff --git a/llvm/test/Transforms/InstCombine/pr44242.ll b/llvm/test/Transforms/InstCombine/pr44242.ll
new file mode 100644
index 0000000000000..e478193eb0805
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/pr44242.ll
@@ -0,0 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; Check that we don't create two redundant phi nodes when %val is used in a
+; form where we can't rewrite it in terms of the new phi node.
+
+; Use %val in an instruction type not supported by optimizeBitCastFromPhi.
+define float @sitofp(float %x) {
+; CHECK-LABEL: @sitofp(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[PHITMP:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[VAL_INCR]] = fadd float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR_CASTED:%.*]] = bitcast float [[VAL_INCR]] to i32
+; CHECK-NEXT:    [[PHITMP]] = sitofp i32 [[VAL_INCR_CASTED]] to float
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    ret float [[VAL]]
+;
+entry:
+  br label %loop_header
+loop_header:
+  %val = phi i32 [ 0, %entry ], [ %val_incr_casted, %loop ]
+  %val_casted = bitcast i32 %val to float
+  %cmp = fcmp ogt float %val_casted, %x
+  br i1 %cmp, label %end, label %loop
+loop:
+  %val_incr = fadd float %val_casted, 1.0
+  %val_incr_casted = bitcast float %val_incr to i32
+  br label %loop_header
+end:
+  %result = sitofp i32 %val to float
+  ret float %result
+}
+
+; Use %val in an incompatible bitcast.
+define <2 x i16> @bitcast(float %x) {
+; CHECK-LABEL: @bitcast(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = phi <2 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[PHITMP:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[VAL_INCR]] = fadd float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[PHITMP]] = bitcast float [[VAL_INCR]] to <2 x i16>
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    ret <2 x i16> [[VAL]]
+;
+entry:
+  br label %loop_header
+loop_header:
+  %val = phi i32 [ 0, %entry ], [ %val_incr_casted, %loop ]
+  %val_casted = bitcast i32 %val to float
+  %cmp = fcmp ogt float %val_casted, %x
+  br i1 %cmp, label %end, label %loop
+loop:
+  %val_incr = fadd float %val_casted, 1.0
+  %val_incr_casted = bitcast float %val_incr to i32
+  br label %loop_header
+end:
+  %result = bitcast i32 %val to <2 x i16>
+  ret <2 x i16> %result
+}
+
+@global = global i32 0
+
+; Use %val with a volatile store.
+define void @store_volatile(float %x) {
+; CHECK-LABEL: @store_volatile(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[VAL_INCR]] = fadd float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR_CASTED]] = bitcast float [[VAL_INCR]] to i32
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    store volatile i32 [[VAL]], i32* @global, align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop_header
+loop_header:
+  %val = phi i32 [ 0, %entry ], [ %val_incr_casted, %loop ]
+  %val_casted = bitcast i32 %val to float
+  %cmp = fcmp ogt float %val_casted, %x
+  br i1 %cmp, label %end, label %loop
+loop:
+  %val_incr = fadd float %val_casted, 1.0
+  %val_incr_casted = bitcast float %val_incr to i32
+  br label %loop_header
+end:
+  store volatile i32 %val, i32* @global
+  ret void
+}
+
+; Use %val with a store where it's actually the address.
+define void @store_address(i32 %x) {
+; CHECK-LABEL: @store_address(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi float* [ bitcast (i32* @global to float*), [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32* [ @global, [[ENTRY]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[VAL_INCR]] = getelementptr float, float* [[TMP0]], i64 1
+; CHECK-NEXT:    [[VAL_INCR_CASTED]] = bitcast float* [[VAL_INCR]] to i32*
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop_header
+loop_header:
+  %val = phi i32* [ @global, %entry ], [ %val_incr_casted, %loop ]
+  %i = phi i32 [ 0, %entry ], [ %i_incr, %loop ]
+  %val_casted = bitcast i32* %val to float*
+  %cmp = icmp sgt i32 %i, %x
+  br i1 %cmp, label %end, label %loop
+loop:
+  %i_incr = add i32 %i, 0
+  %val_incr = getelementptr float, float* %val_casted, i32 1
+  %val_incr_casted = bitcast float* %val_incr to i32*
+  br label %loop_header
+end:
+  store i32 0, i32* %val
+  ret void
+}
+
+; Test where a phi (%val2) other than the original one (%val) has an
+; incompatible use.
+define i32 @multiple_phis(float %x) {
+; CHECK-LABEL: @multiple_phis(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[LOOP_END:%.*]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VAL2:%.*]], [[LOOP_END]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[TMP0]], 2.000000e+00
+; CHECK-NEXT:    br i1 [[CMP2]], label [[IF:%.*]], label [[LOOP_END]]
+; CHECK:       if:
+; CHECK-NEXT:    [[VAL_INCR:%.*]] = fadd float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR_CASTED:%.*]] = bitcast float [[VAL_INCR]] to i32
+; CHECK-NEXT:    br label [[LOOP_END]]
+; CHECK:       loop_end:
+; CHECK-NEXT:    [[TMP1]] = phi float [ [[TMP0]], [[LOOP]] ], [ [[VAL_INCR]], [[IF]] ]
+; CHECK-NEXT:    [[VAL2]] = phi i32 [ [[VAL]], [[LOOP]] ], [ [[VAL_INCR_CASTED]], [[IF]] ]
+; CHECK-NEXT:    store volatile i32 [[VAL2]], i32* @global, align 4
+; CHECK-NEXT:    br label [[LOOP_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    ret i32 [[VAL]]
+;
+entry:
+  br label %loop_header
+loop_header:
+  %val = phi i32 [ 0, %entry ], [ %val2, %loop_end ]
+  %val_casted = bitcast i32 %val to float
+  %cmp = fcmp ogt float %val_casted, %x
+  br i1 %cmp, label %end, label %loop
+loop:
+  %cmp2 = fcmp ogt float %val_casted, 2.0
+  br i1 %cmp2, label %if, label %loop_end
+if:
+  %val_incr = fadd float %val_casted, 1.0
+  %val_incr_casted = bitcast float %val_incr to i32
+  br label %loop_end
+loop_end:
+  %val2 = phi i32 [ %val, %loop ], [ %val_incr_casted, %if ]
+  store volatile i32 %val2, i32* @global ; the incompatible use
+  br label %loop_header
+end:
+  ret i32 %val
+}

From f88b689b7728cc861ad9294a694aa8b992870862 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Tue, 31 Dec 2019 12:11:35 +0100
Subject: [PATCH 7/7] [InstCombine] Don't rewrite phi-of-bitcast when the phi
 has other users

Judging by the existing comments, this was the intention, but the
transform never actually checked if the existing PHIs would be removed.
See https://bugs.llvm.org/show_bug.cgi?id=44242 for an example where
this causes much worse code generation on AMDGPU.

Differential Revision: https://reviews.llvm.org/D71209
---
 .../InstCombine/InstCombineCasts.cpp          | 57 ++++++++++++++-----
 llvm/test/Transforms/InstCombine/pr44242.ll   | 52 ++++++++---------
 2 files changed, 68 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 20d6242901eaa..80b0be016da77 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2217,6 +2217,31 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
     }
   }
 
+  // Check that each user of each old PHI node is something that we can
+  // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
+  for (auto *OldPN : OldPhiNodes) {
+    for (User *V : OldPN->users()) {
+      if (auto *SI = dyn_cast<StoreInst>(V)) {
+        if (!SI->isSimple() || SI->getOperand(0) != OldPN)
+          return nullptr;
+      } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+        // Verify it's a B->A cast.
+        Type *TyB = BCI->getOperand(0)->getType();
+        Type *TyA = BCI->getType();
+        if (TyA != DestTy || TyB != SrcTy)
+          return nullptr;
+      } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+        // As long as the user is another old PHI node, then even if we don't
+        // rewrite it, the PHI web we're considering won't have any users
+        // outside itself, so it'll be dead.
+        if (OldPhiNodes.count(PHI) == 0)
+          return nullptr;
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
   // For each old PHI node, create a corresponding new PHI node with a type A.
   SmallDenseMap<PHINode *, PHINode *> NewPNodes;
   for (auto *OldPN : OldPhiNodes) {
@@ -2269,24 +2294,28 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
       // We may remove this user, advance to avoid iterator invalidation.
       ++It;
       if (auto *SI = dyn_cast<StoreInst>(V)) {
-        if (SI->isSimple() && SI->getOperand(0) == OldPN) {
-          Builder.SetInsertPoint(SI);
-          auto *NewBC =
-            cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
-          SI->setOperand(0, NewBC);
-          Worklist.Add(SI);
-          assert(hasStoreUsersOnly(*NewBC));
-        }
+        assert(SI->isSimple() && SI->getOperand(0) == OldPN);
+        Builder.SetInsertPoint(SI);
+        auto *NewBC =
+          cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
+        SI->setOperand(0, NewBC);
+        Worklist.Add(SI);
+        assert(hasStoreUsersOnly(*NewBC));
       }
       else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
-        // Verify it's a B->A cast.
         Type *TyB = BCI->getOperand(0)->getType();
         Type *TyA = BCI->getType();
-        if (TyA == DestTy && TyB == SrcTy) {
-          Instruction *I = replaceInstUsesWith(*BCI, NewPN);
-          if (BCI == &CI)
-            RetVal = I;
-        }
+        assert(TyA == DestTy && TyB == SrcTy);
+        (void) TyA;
+        (void) TyB;
+        Instruction *I = replaceInstUsesWith(*BCI, NewPN);
+        if (BCI == &CI)
+          RetVal = I;
+      } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+        assert(OldPhiNodes.count(PHI) > 0);
+        (void) PHI;
+      } else {
+        llvm_unreachable("all uses should be handled");
       }
     }
   }
diff --git a/llvm/test/Transforms/InstCombine/pr44242.ll b/llvm/test/Transforms/InstCombine/pr44242.ll
index e478193eb0805..5e783af734785 100644
--- a/llvm/test/Transforms/InstCombine/pr44242.ll
+++ b/llvm/test/Transforms/InstCombine/pr44242.ll
@@ -10,17 +10,17 @@ define float @sitofp(float %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop_header:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[PHITMP:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[VAL_INCR]] = fadd float [[TMP0]], 1.000000e+00
-; CHECK-NEXT:    [[VAL_INCR_CASTED:%.*]] = bitcast float [[VAL_INCR]] to i32
-; CHECK-NEXT:    [[PHITMP]] = sitofp i32 [[VAL_INCR_CASTED]] to float
+; CHECK-NEXT:    [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR_CASTED]] = bitcast float [[VAL_INCR]] to i32
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       end:
-; CHECK-NEXT:    ret float [[VAL]]
+; CHECK-NEXT:    [[RESULT:%.*]] = sitofp i32 [[VAL]] to float
+; CHECK-NEXT:    ret float [[RESULT]]
 ;
 entry:
   br label %loop_header
@@ -44,16 +44,17 @@ define <2 x i16> @bitcast(float %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop_header:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = phi <2 x i16> [ zeroinitializer, [[ENTRY]] ], [ [[PHITMP:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[VAL_INCR]] = fadd float [[TMP0]], 1.000000e+00
-; CHECK-NEXT:    [[PHITMP]] = bitcast float [[VAL_INCR]] to <2 x i16>
+; CHECK-NEXT:    [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR_CASTED]] = bitcast float [[VAL_INCR]] to i32
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       end:
-; CHECK-NEXT:    ret <2 x i16> [[VAL]]
+; CHECK-NEXT:    [[RESULT:%.*]] = bitcast i32 [[VAL]] to <2 x i16>
+; CHECK-NEXT:    ret <2 x i16> [[RESULT]]
 ;
 entry:
   br label %loop_header
@@ -79,12 +80,12 @@ define void @store_volatile(float %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop_header:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP:%.*]] ]
+; CHECK-NEXT:    [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[VAL_INCR]] = fadd float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00
 ; CHECK-NEXT:    [[VAL_INCR_CASTED]] = bitcast float [[VAL_INCR]] to i32
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       end:
@@ -113,13 +114,11 @@ define void @store_address(i32 %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop_header:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float* [ bitcast (i32* @global to float*), [[ENTRY:%.*]] ], [ [[VAL_INCR:%.*]], [[LOOP:%.*]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = phi i32* [ @global, [[ENTRY]] ], [ [[VAL_INCR_CASTED:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32* [ @global, [[ENTRY:%.*]] ], [ [[VAL_INCR1:%.*]], [[LOOP:%.*]] ]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[VAL_INCR]] = getelementptr float, float* [[TMP0]], i64 1
-; CHECK-NEXT:    [[VAL_INCR_CASTED]] = bitcast float* [[VAL_INCR]] to i32*
+; CHECK-NEXT:    [[VAL_INCR1]] = getelementptr i32, i32* [[VAL]], i64 1
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]
 ; CHECK:       end:
 ; CHECK-NEXT:    store i32 0, i32* [[VAL]], align 4
@@ -150,19 +149,18 @@ define i32 @multiple_phis(float %x) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop_header:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[LOOP_END:%.*]] ]
-; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[VAL2:%.*]], [[LOOP_END]] ]
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[TMP0]], [[X:%.*]]
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL2:%.*]], [[LOOP_END:%.*]] ]
+; CHECK-NEXT:    [[VAL_CASTED:%.*]] = bitcast i32 [[VAL]] to float
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[VAL_CASTED]], [[X:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[TMP0]], 2.000000e+00
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt float [[VAL_CASTED]], 2.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP2]], label [[IF:%.*]], label [[LOOP_END]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[VAL_INCR:%.*]] = fadd float [[TMP0]], 1.000000e+00
+; CHECK-NEXT:    [[VAL_INCR:%.*]] = fadd float [[VAL_CASTED]], 1.000000e+00
 ; CHECK-NEXT:    [[VAL_INCR_CASTED:%.*]] = bitcast float [[VAL_INCR]] to i32
 ; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       loop_end:
-; CHECK-NEXT:    [[TMP1]] = phi float [ [[TMP0]], [[LOOP]] ], [ [[VAL_INCR]], [[IF]] ]
 ; CHECK-NEXT:    [[VAL2]] = phi i32 [ [[VAL]], [[LOOP]] ], [ [[VAL_INCR_CASTED]], [[IF]] ]
 ; CHECK-NEXT:    store volatile i32 [[VAL2]], i32* @global, align 4
 ; CHECK-NEXT:    br label [[LOOP_HEADER]]