From 6c570f789dc827960f3b8f6acddf0a07a7746673 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 16 May 2020 10:41:35 -0400
Subject: [PATCH 01/24] GlobalISel: Add G_EXTRACT/G_INSERT offset to legalize
 info

Immediate legalize fields were added for G_SEXT_INREG. Similarly, these
are likely not legal except for certain offsets.
---
 .../llvm/CodeGen/GlobalISel/LegalizerInfo.h       | 15 +++++++++++++++
 llvm/include/llvm/Target/GenericOpcodes.td        |  4 ++--
 .../GlobalISel/legalizer-info-validation.mir      |  4 ++--
 .../CodeGen/GlobalISel/LegalizerHelperTest.cpp    |  2 +-
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 49bc66a89a219..b3c7b68422582 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -459,6 +459,14 @@ class LegalizeRuleSet {
     immIdx(0); // Inform verifier imm idx 0 is handled.
     return actionIf(Action, typeInSet(typeIdx(0), Types));
   }
+
+  LegalizeRuleSet &actionForTypeWithAnyImm(
+    LegalizeAction Action, std::initializer_list<std::pair<LLT, LLT>> Types) {
+    using namespace LegalityPredicates;
+    immIdx(0); // Inform verifier imm idx 0 is handled.
+    return actionIf(Action, typePairInSet(typeIdx(0), typeIdx(1), Types));
+  }
+
   /// Use the given action when type indexes 0 and 1 are both in the given list.
   /// That is, the type pair is in the cartesian product of the list.
   /// Action should not be an action that requires mutation.
@@ -528,6 +536,13 @@ class LegalizeRuleSet {
     markAllIdxsAsCovered();
     return actionForTypeWithAnyImm(LegalizeAction::Legal, Types);
   }
+
+  LegalizeRuleSet &legalForTypeWithAnyImm(
+    std::initializer_list<std::pair<LLT, LLT>> Types) {
+    markAllIdxsAsCovered();
+    return actionForTypeWithAnyImm(LegalizeAction::Legal, Types);
+  }
+
   /// The instruction is legal when type indexes 0 and 1 along with the memory
   /// size and minimum alignment is any type and size tuple in the given list.
   LegalizeRuleSet &legalForTypesWithMemDesc(
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 79b965e3fef07..eafcb3d96ff3f 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -974,7 +974,7 @@ def G_FENCE : GenericInstruction {
 // register banks have been selected.
 def G_EXTRACT : GenericInstruction {
   let OutOperandList = (outs type0:$res);
-  let InOperandList = (ins type1:$src, unknown:$offset);
+  let InOperandList = (ins type1:$src, untyped_imm_0:$offset);
   let hasSideEffects = 0;
 }
 
@@ -993,7 +993,7 @@ def G_UNMERGE_VALUES : GenericInstruction {
 // Insert a smaller register into a larger one at the specified bit-index.
 def G_INSERT : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src, type1:$op, unknown:$offset);
+  let InOperandList = (ins type0:$src, type1:$op, untyped_imm_0:$offset);
   let hasSideEffects = 0;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 769144102434c..35b41b8aaa871 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -76,7 +76,7 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_EXTRACT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: G_EXTRACT (opcode {{[0-9]+}}): 2 type indices, 1 imm index
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
@@ -84,7 +84,7 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_INSERT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: G_INSERT (opcode {{[0-9]+}}): 2 type indices, 1 imm index
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
index 817d707776cb7..93f4f703d239b 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
@@ -2311,7 +2311,7 @@ TEST_F(AArch64GISelMITest, NarrowScalarExtract) {
   // Declare your legalization info
   DefineLegalizerInfo(A, {
     getActionDefinitionsBuilder(G_UNMERGE_VALUES).legalFor({{s32, s64}});
-    getActionDefinitionsBuilder(G_EXTRACT).legalFor({{s16, s32}});
+    getActionDefinitionsBuilder(G_EXTRACT).legalForTypeWithAnyImm({{s16, s32}});
   });
 
   LLT S16{LLT::scalar(16)};

From 45e1a22a92bf2c33336ccc02ea4fa3996f60252b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 3 Jun 2020 22:06:49 -0400
Subject: [PATCH 02/24] GlobalISel: Make known bits/alignment API more
 consistent

Just computing the alignment makes sense without caring about the
general known bits, such as for non-integral pointers. Separate the
two and start calling into the TargetLowering hooks for frame indexes.

Start calling the TargetLowering implementation for FrameIndexes,
which improves the AMDGPU matching for stack addressing modes. Also
introduce a new hook for returning known alignment of target
instructions. For AMDGPU, it would be useful to report the known
alignment implied by certain intrinsic calls.

Also stop using MaybeAlign.
---
 .../llvm/CodeGen/GlobalISel/GISelKnownBits.h  | 16 +++----
 llvm/include/llvm/CodeGen/TargetLowering.h    |  9 ++++
 .../lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 42 +++++++------------
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  6 +++
 .../GlobalISel/inst-select-load-private.mir   |  5 +--
 .../GlobalISel/inst-select-store-private.mir  | 14 ++-----
 6 files changed, 41 insertions(+), 51 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h
index 976d42d588462..0f76abff86f97 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h
@@ -69,18 +69,14 @@ class GISelKnownBits : public GISelChangeObserver {
   /// predicate to simplify operations downstream.
   bool signBitIsZero(Register Op);
 
-  // FIXME: Is this the right place for G_FRAME_INDEX? Should it be in
-  // TargetLowering?
-  void computeKnownBitsForFrameIndex(Register R, KnownBits &Known,
-                                     const APInt &DemandedElts,
-                                     unsigned Depth = 0);
-  static Align inferAlignmentForFrameIdx(int FrameIdx, int Offset,
-                                         const MachineFunction &MF);
   static void computeKnownBitsForAlignment(KnownBits &Known,
-                                           MaybeAlign Alignment);
+                                           Align Alignment) {
+    // The low bits are known zero if the pointer is aligned.
+    Known.Zero.setLowBits(Log2(Alignment));
+  }
 
-  // Try to infer alignment for MI.
-  static MaybeAlign inferPtrAlignment(const MachineInstr &MI);
+  /// \return The known alignment for the pointer-like value \p R.
+  Align computeKnownAlignment(Register R, unsigned Depth = 0);
 
   // Observer API. No-op for non-caching implementation.
   void erasingInstr(MachineInstr &MI) override{};
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b3c3bcadc4cd9..04582a8634c76 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3348,6 +3348,15 @@ class TargetLowering : public TargetLoweringBase {
                                               const MachineRegisterInfo &MRI,
                                               unsigned Depth = 0) const;
 
+  /// Determine the known alignment for the pointer value \p R. This can
+  /// typically be inferred from the number of low known 0 bits. However, for a
+  /// pointer with a non-integral address space, the alignment value may be
+  /// independent from the known low bits.
+  virtual Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis,
+                                                Register R,
+                                                const MachineRegisterInfo &MRI,
+                                                unsigned Depth = 0) const;
+
   /// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
   /// Default implementation computes low bits based on alignment
   /// information. This should preserve known bits passed into it.
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 5c6e4dd645645..3a4a373a6c73f 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -31,36 +31,23 @@ GISelKnownBits::GISelKnownBits(MachineFunction &MF, unsigned MaxDepth)
     : MF(MF), MRI(MF.getRegInfo()), TL(*MF.getSubtarget().getTargetLowering()),
       DL(MF.getFunction().getParent()->getDataLayout()), MaxDepth(MaxDepth) {}
 
-Align GISelKnownBits::inferAlignmentForFrameIdx(int FrameIdx, int Offset,
-                                                const MachineFunction &MF) {
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  return commonAlignment(MFI.getObjectAlign(FrameIdx), Offset);
-  // TODO: How to handle cases with Base + Offset?
-}
-
-MaybeAlign GISelKnownBits::inferPtrAlignment(const MachineInstr &MI) {
-  if (MI.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
-    int FrameIdx = MI.getOperand(1).getIndex();
-    return inferAlignmentForFrameIdx(FrameIdx, 0, *MI.getMF());
+Align GISelKnownBits::computeKnownAlignment(Register R, unsigned Depth) {
+  const MachineInstr *MI = MRI.getVRegDef(R);
+  switch (MI->getOpcode()) {
+  case TargetOpcode::G_FRAME_INDEX: {
+    int FrameIdx = MI->getOperand(1).getIndex();
+    return MF.getFrameInfo().getObjectAlign(FrameIdx);
+  }
+  case TargetOpcode::G_INTRINSIC:
+  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+  default:
+    return TL.computeKnownAlignForTargetInstr(*this, R, MRI, Depth + 1);
   }
-  return None;
-}
-
-void GISelKnownBits::computeKnownBitsForFrameIndex(Register R, KnownBits &Known,
-                                                   const APInt &DemandedElts,
-                                                   unsigned Depth) {
-  const MachineInstr &MI = *MRI.getVRegDef(R);
-  computeKnownBitsForAlignment(Known, inferPtrAlignment(MI));
-}
-
-void GISelKnownBits::computeKnownBitsForAlignment(KnownBits &Known,
-                                                  MaybeAlign Alignment) {
-  if (Alignment)
-    // The low bits are known zero if the pointer is aligned.
-    Known.Zero.setLowBits(Log2(*Alignment));
 }
 
 KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) {
+  assert(MI.getNumExplicitDefs() == 1 &&
+         "expected single return generic instruction");
   return getKnownBits(MI.getOperand(0).getReg());
 }
 
@@ -215,7 +202,8 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
     break;
   }
   case TargetOpcode::G_FRAME_INDEX: {
-    computeKnownBitsForFrameIndex(R, Known, DemandedElts);
+    int FrameIdx = MI.getOperand(1).getIndex();
+    TL.computeKnownBitsForFrameIndex(FrameIdx, Known, MF);
     break;
   }
   case TargetOpcode::G_SUB: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4a0537224beed..d231d35165389 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2832,6 +2832,12 @@ void TargetLowering::computeKnownBitsForFrameIndex(
   Known.Zero.setLowBits(Log2(MF.getFrameInfo().getObjectAlign(FrameIdx)));
 }
 
+Align TargetLowering::computeKnownAlignForTargetInstr(
+  GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI,
+  unsigned Depth) const {
+  return Align(1);
+}
+
 /// This method can be implemented by targets that want to expose additional
 /// information about sign bits to the DAG Combiner.
 unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 79284fdfd05f7..9974e6b2bf651 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -793,10 +793,7 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_1_fi_offset_4095
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095
     ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index 9ac43878862c0..f58ffe784e3ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -205,11 +205,8 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
@@ -477,11 +474,8 @@ body: |
 
     ; GFX6-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
     ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
-    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
-    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec

From 3d7b926dd16ff9ff264337793b76080681636f15 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Thu, 4 Jun 2020 17:57:11 -0700
Subject: [PATCH 03/24] Move GetXcode*Directory into HostInfo (NFC)

These functions really don't belong in PlatformDarwin, since they
actually query the state of the Host and not of the remote platform.
---
 lldb/include/lldb/Host/HostInfoBase.h         |  3 +
 .../include/lldb/Host/macosx/HostInfoMacOSX.h |  3 +-
 lldb/include/lldb/Utility/XcodeSDK.h          |  2 +
 .../Host/macosx/objcxx/HostInfoMacOSX.mm      | 57 +++++++++++++
 .../MacOSX/PlatformAppleSimulator.cpp         |  9 +-
 .../MacOSX/PlatformAppleTVSimulator.cpp       |  2 +-
 .../MacOSX/PlatformAppleWatchSimulator.cpp    |  2 +-
 .../Platform/MacOSX/PlatformDarwin.cpp        | 83 +------------------
 .../Plugins/Platform/MacOSX/PlatformDarwin.h  |  4 -
 .../Platform/MacOSX/PlatformDarwinKernel.cpp  |  3 +-
 .../Platform/MacOSX/PlatformMacOSX.cpp        |  2 +-
 .../MacOSX/PlatformRemoteDarwinDevice.cpp     |  3 +-
 .../Platform/MacOSX/PlatformiOSSimulator.cpp  |  2 +-
 lldb/source/Utility/XcodeSDK.cpp              | 22 +++++
 .../unittests/Platform/PlatformDarwinTest.cpp | 39 ---------
 lldb/unittests/Utility/XcodeSDKTest.cpp       | 35 ++++++++
 16 files changed, 136 insertions(+), 135 deletions(-)

diff --git a/lldb/include/lldb/Host/HostInfoBase.h b/lldb/include/lldb/Host/HostInfoBase.h
index dbd11505c21bb..70682c9b685eb 100644
--- a/lldb/include/lldb/Host/HostInfoBase.h
+++ b/lldb/include/lldb/Host/HostInfoBase.h
@@ -92,6 +92,9 @@ class HostInfoBase {
   static bool ComputePathRelativeToLibrary(FileSpec &file_spec,
                                            llvm::StringRef dir);
 
+  static FileSpec GetXcodeContentsDirectory() { return {}; }
+  static FileSpec GetXcodeDeveloperDirectory() { return {}; }
+  
   /// Return the directory containing a specific Xcode SDK.
   static llvm::StringRef GetXcodeSDKPath(XcodeSDK sdk) { return {}; }
 
diff --git a/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h b/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h
index dacb8c40f0fb0..3941414f8abdd 100644
--- a/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h
+++ b/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h
@@ -32,7 +32,8 @@ class HostInfoMacOSX : public HostInfoPosix {
   static bool GetOSBuildString(std::string &s);
   static bool GetOSKernelDescription(std::string &s);
   static FileSpec GetProgramFileSpec();
-  static std::string FindXcodeContentsDirectoryInPath(llvm::StringRef path);
+  static FileSpec GetXcodeContentsDirectory();
+  static FileSpec GetXcodeDeveloperDirectory();
 
   /// Query xcrun to find an Xcode SDK directory.
   static llvm::StringRef GetXcodeSDKPath(XcodeSDK sdk);
diff --git a/lldb/include/lldb/Utility/XcodeSDK.h b/lldb/include/lldb/Utility/XcodeSDK.h
index 2ed5fab1c941f..307fe7f46798e 100644
--- a/lldb/include/lldb/Utility/XcodeSDK.h
+++ b/lldb/include/lldb/Utility/XcodeSDK.h
@@ -87,6 +87,8 @@ class XcodeSDK {
   static std::string GetCanonicalName(Info info);
   /// Return the best-matching SDK type for a specific triple.
   static XcodeSDK::Type GetSDKTypeForTriple(const llvm::Triple &triple);
+
+  static std::string FindXcodeContentsDirectoryInPath(llvm::StringRef path);
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index 615f77b2dbcc3..5c459a0413837 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -297,6 +297,63 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
   }
 }
 
+FileSpec HostInfoMacOSX::GetXcodeContentsDirectory() {
+  static FileSpec g_xcode_contents_path;
+  static std::once_flag g_once_flag;
+  std::call_once(g_once_flag, [&]() {
+    // Try the shlib dir first.
+    if (FileSpec fspec = HostInfo::GetShlibDir()) {
+      if (FileSystem::Instance().Exists(fspec)) {
+        std::string xcode_contents_dir =
+            XcodeSDK::FindXcodeContentsDirectoryInPath(fspec.GetPath());
+        if (!xcode_contents_dir.empty()) {
+          g_xcode_contents_path = FileSpec(xcode_contents_dir);
+          return;
+        }
+      }
+    }
+
+    if (const char *developer_dir_env_var = getenv("DEVELOPER_DIR")) {
+      FileSpec fspec(developer_dir_env_var);
+      if (FileSystem::Instance().Exists(fspec)) {
+        // FIXME: This looks like it couldn't possibly work!
+        std::string xcode_contents_dir =
+            XcodeSDK::FindXcodeContentsDirectoryInPath(fspec.GetPath());
+        if (!xcode_contents_dir.empty()) {
+          g_xcode_contents_path = FileSpec(xcode_contents_dir);
+          return;
+        }
+      }
+    }
+
+    FileSpec fspec(HostInfo::GetXcodeSDKPath(XcodeSDK::GetAnyMacOS()));
+    if (fspec) {
+      if (FileSystem::Instance().Exists(fspec)) {
+        std::string xcode_contents_dir =
+            XcodeSDK::FindXcodeContentsDirectoryInPath(fspec.GetPath());
+        if (!xcode_contents_dir.empty()) {
+          g_xcode_contents_path = FileSpec(xcode_contents_dir);
+          return;
+        }
+      }
+    }
+  });
+  return g_xcode_contents_path;
+}
+
+lldb_private::FileSpec HostInfoMacOSX::GetXcodeDeveloperDirectory() {
+  static lldb_private::FileSpec g_developer_directory;
+  static llvm::once_flag g_once_flag;
+  llvm::call_once(g_once_flag, []() {
+    if (FileSpec fspec = GetXcodeContentsDirectory()) {
+      fspec.AppendPathComponent("Developer");
+      if (FileSystem::Instance().Exists(fspec))
+        g_developer_directory = fspec;
+    }
+  });
+  return g_developer_directory;
+}
+
 static std::string GetXcodeSDK(XcodeSDK sdk) {
   XcodeSDK::Info info = sdk.Parse();
   std::string sdk_name = XcodeSDK::GetCanonicalName(info);
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
index cb6fbce19e589..3f83a23de8df5 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp
@@ -15,6 +15,7 @@
 #include <mutex>
 #include <thread>
 #include "lldb/Host/PseudoTerminal.h"
+#include "lldb/Host/HostInfo.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Utility/LLDBAssert.h"
 #include "lldb/Utility/Status.h"
@@ -77,7 +78,7 @@ void PlatformAppleSimulator::GetStatus(Stream &strm) {
   // simulator
   PlatformAppleSimulator::LoadCoreSimulator();
 
-  std::string developer_dir = GetXcodeDeveloperDirectory().GetPath();
+  std::string developer_dir = HostInfo::GetXcodeDeveloperDirectory().GetPath();
   CoreSimulatorSupport::DeviceSet devices =
       CoreSimulatorSupport::DeviceSet::GetAvailableDevices(
           developer_dir.c_str());
@@ -124,7 +125,7 @@ Status PlatformAppleSimulator::ConnectRemote(Args &args) {
     const char *arg_cstr = args.GetArgumentAtIndex(0);
     if (arg_cstr) {
       std::string arg_str(arg_cstr);
-      std::string developer_dir = GetXcodeDeveloperDirectory().GetPath();
+      std::string developer_dir = HostInfo::GetXcodeDeveloperDirectory().GetPath();
       CoreSimulatorSupport::DeviceSet devices =
           CoreSimulatorSupport::DeviceSet::GetAvailableDevices(
               developer_dir.c_str());
@@ -214,7 +215,7 @@ FileSpec PlatformAppleSimulator::GetCoreSimulatorPath() {
 #if defined(__APPLE__)
   std::lock_guard<std::mutex> guard(m_core_sim_path_mutex);
   if (!m_core_simulator_framework_path.hasValue()) {
-    if (FileSpec fspec = GetXcodeDeveloperDirectory()) {
+    if (FileSpec fspec = HostInfo::GetXcodeDeveloperDirectory()) {
       std::string developer_dir = fspec.GetPath();
       StreamString cs_path;
       cs_path.Printf(
@@ -247,7 +248,7 @@ CoreSimulatorSupport::Device PlatformAppleSimulator::GetSimulatorDevice() {
   if (!m_device.hasValue()) {
     const CoreSimulatorSupport::DeviceType::ProductFamilyID dev_id =
         CoreSimulatorSupport::DeviceType::ProductFamilyID::iPhone;
-    std::string developer_dir = GetXcodeDeveloperDirectory().GetPath();
+    std::string developer_dir = HostInfo::GetXcodeDeveloperDirectory().GetPath();
     m_device = CoreSimulatorSupport::DeviceSet::GetAvailableDevices(
                    developer_dir.c_str())
                    .GetFanciest(dev_id);
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp
index fa5f93ba14887..82114c7094814 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleTVSimulator.cpp
@@ -255,7 +255,7 @@ EnumerateDirectoryCallback(void *baton, llvm::sys::fs::file_type ft,
 const char *PlatformAppleTVSimulator::GetSDKDirectoryAsCString() {
   std::lock_guard<std::mutex> guard(m_sdk_dir_mutex);
   if (m_sdk_directory.empty()) {
-    if (FileSpec fspec = GetXcodeDeveloperDirectory()) {
+    if (FileSpec fspec = HostInfo::GetXcodeDeveloperDirectory()) {
       std::string developer_dir = fspec.GetPath();
       char sdks_directory[PATH_MAX];
       char sdk_dirname[PATH_MAX];
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp
index 6cd20164ae78d..c345c0cf3f6f2 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleWatchSimulator.cpp
@@ -255,7 +255,7 @@ EnumerateDirectoryCallback(void *baton, llvm::sys::fs::file_type ft,
 const char *PlatformAppleWatchSimulator::GetSDKDirectoryAsCString() {
   std::lock_guard<std::mutex> guard(m_sdk_dir_mutex);
   if (m_sdk_directory.empty()) {
-    if (FileSpec fspec = GetXcodeDeveloperDirectory()) {
+    if (FileSpec fspec = HostInfo::GetXcodeDeveloperDirectory()) {
       std::string developer_dir = fspec.GetPath();
       char sdks_directory[PATH_MAX];
       char sdk_dirname[PATH_MAX];
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index 635dd1f059710..85cd000d8787c 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -1133,19 +1133,6 @@ static FileSpec GetXcodeSelectPath() {
   return g_xcode_select_filespec;
 }
 
-lldb_private::FileSpec PlatformDarwin::GetXcodeDeveloperDirectory() {
-  static lldb_private::FileSpec g_developer_directory;
-  static llvm::once_flag g_once_flag;
-  llvm::call_once(g_once_flag, []() {
-    if (FileSpec fspec = GetXcodeContentsDirectory()) {
-      fspec.AppendPathComponent("Developer");
-      if (FileSystem::Instance().Exists(fspec))
-        g_developer_directory = fspec;
-    }
-  });
-  return g_developer_directory;
-}
-
 BreakpointSP PlatformDarwin::SetThreadCreationBreakpoint(Target &target) {
   BreakpointSP bp_sp;
   static const char *g_bp_names[] = {
@@ -1260,7 +1247,7 @@ FileSpec PlatformDarwin::FindSDKInXcodeForModules(XcodeSDK::Type sdk_type,
 }
 
 FileSpec PlatformDarwin::GetSDKDirectoryForModules(XcodeSDK::Type sdk_type) {
-  FileSpec sdks_spec = GetXcodeContentsDirectory();
+  FileSpec sdks_spec = HostInfo::GetXcodeContentsDirectory();
   sdks_spec.AppendPathComponent("Developer");
   sdks_spec.AppendPathComponent("Platforms");
 
@@ -1586,7 +1573,7 @@ lldb_private::FileSpec PlatformDarwin::LocateExecutable(const char *basename) {
   llvm::call_once(g_once_flag, []() {
 
     // When locating executables, trust the DEVELOPER_DIR first if it is set
-    FileSpec xcode_contents_dir = GetXcodeContentsDirectory();
+    FileSpec xcode_contents_dir = HostInfo::GetXcodeContentsDirectory();
     if (xcode_contents_dir) {
       FileSpec xcode_lldb_resources = xcode_contents_dir;
       xcode_lldb_resources.AppendPathComponent("SharedFrameworks");
@@ -1738,72 +1725,6 @@ std::string PlatformDarwin::FindComponentInPath(llvm::StringRef path,
   return {};
 }
 
-std::string
-PlatformDarwin::FindXcodeContentsDirectoryInPath(llvm::StringRef path) {
-  auto begin = llvm::sys::path::begin(path);
-  auto end = llvm::sys::path::end(path);
-
-  // Iterate over the path components until we find something that ends with
-  // .app. If the next component is Contents then we've found the Contents
-  // directory.
-  for (auto it = begin; it != end; ++it) {
-    if (it->endswith(".app")) {
-      auto next = it;
-      if (++next != end && *next == "Contents") {
-        llvm::SmallString<128> buffer;
-        llvm::sys::path::append(buffer, begin, ++next,
-                                llvm::sys::path::Style::posix);
-        return buffer.str().str();
-      }
-    }
-  }
-
-  return {};
-}
-
-FileSpec PlatformDarwin::GetXcodeContentsDirectory() {
-  static FileSpec g_xcode_contents_path;
-  static std::once_flag g_once_flag;
-  std::call_once(g_once_flag, [&]() {
-    // Try the shlib dir first.
-    if (FileSpec fspec = HostInfo::GetShlibDir()) {
-      if (FileSystem::Instance().Exists(fspec)) {
-        std::string xcode_contents_dir =
-            FindXcodeContentsDirectoryInPath(fspec.GetPath());
-        if (!xcode_contents_dir.empty()) {
-          g_xcode_contents_path = FileSpec(xcode_contents_dir);
-          return;
-        }
-      }
-    }
-
-    if (const char *developer_dir_env_var = getenv("DEVELOPER_DIR")) {
-      FileSpec fspec(developer_dir_env_var);
-      if (FileSystem::Instance().Exists(fspec)) {
-        std::string xcode_contents_dir =
-            FindXcodeContentsDirectoryInPath(fspec.GetPath());
-        if (!xcode_contents_dir.empty()) {
-          g_xcode_contents_path = FileSpec(xcode_contents_dir);
-          return;
-        }
-      }
-    }
-
-    FileSpec fspec(HostInfo::GetXcodeSDKPath(XcodeSDK::GetAnyMacOS()));
-    if (fspec) {
-      if (FileSystem::Instance().Exists(fspec)) {
-        std::string xcode_contents_dir =
-            FindXcodeContentsDirectoryInPath(fspec.GetPath());
-        if (!xcode_contents_dir.empty()) {
-          g_xcode_contents_path = FileSpec(xcode_contents_dir);
-          return;
-        }
-      }
-    }
-  });
-  return g_xcode_contents_path;
-}
-
 FileSpec PlatformDarwin::GetCurrentToolchainDirectory() {
   if (FileSpec fspec = HostInfo::GetShlibDir())
     return FileSpec(FindComponentInPath(fspec.GetPath(), ".xctoolchain"));
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h
index f93f8f4ffc72c..8e28a70003106 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h
@@ -89,9 +89,6 @@ class PlatformDarwin : public PlatformPOSIX {
   llvm::Expected<lldb_private::StructuredData::DictionarySP>
   FetchExtendedCrashInformation(lldb_private::Process &process) override;
 
-  static lldb_private::FileSpec GetXcodeContentsDirectory();
-  static lldb_private::FileSpec GetXcodeDeveloperDirectory();
-
   /// Return the toolchain directory the current LLDB instance is located in.
   static lldb_private::FileSpec GetCurrentToolchainDirectory();
 
@@ -165,7 +162,6 @@ class PlatformDarwin : public PlatformPOSIX {
 
   static std::string FindComponentInPath(llvm::StringRef path,
                                          llvm::StringRef component);
-  static std::string FindXcodeContentsDirectoryInPath(llvm::StringRef path);
 
   std::string m_developer_directory;
   llvm::StringMap<std::string> m_sdk_path;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
index 6d9f20a773695..f6c0f262a3798 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
@@ -17,6 +17,7 @@
 #include "lldb/Core/ModuleSpec.h"
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Host/Host.h"
+#include "lldb/Host/HostInfo.h"
 #include "lldb/Interpreter/OptionValueFileSpecList.h"
 #include "lldb/Interpreter/OptionValueProperties.h"
 #include "lldb/Interpreter/Property.h"
@@ -327,7 +328,7 @@ void PlatformDarwinKernel::CollectKextAndKernelDirectories() {
 
   // DeveloperDirectory is something like
   // "/Applications/Xcode.app/Contents/Developer"
-  std::string developer_dir = GetXcodeDeveloperDirectory().GetPath();
+  std::string developer_dir = HostInfo::GetXcodeDeveloperDirectory().GetPath();
   if (developer_dir.empty())
     developer_dir = "/Applications/Xcode.app/Contents/Developer";
 
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp
index 84bc85e53c3b2..0b7f898ee0d33 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp
@@ -197,7 +197,7 @@ ConstString PlatformMacOSX::GetSDKDirectory(lldb_private::Target &target) {
     return {};
 
   // First try to find an SDK that matches the given SDK version.
-  if (FileSpec fspec = GetXcodeContentsDirectory()) {
+  if (FileSpec fspec = HostInfo::GetXcodeContentsDirectory()) {
     StreamString sdk_path;
     sdk_path.Printf("%s/Developer/Platforms/MacOSX.platform/Developer/"
                     "SDKs/MacOSX%u.%u.sdk",
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp
index 40dd903201519..e4ede0dc638b0 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp
@@ -15,6 +15,7 @@
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Host/FileSystem.h"
 #include "lldb/Host/Host.h"
+#include "lldb/Host/HostInfo.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/FileSpec.h"
@@ -342,7 +343,7 @@ PlatformRemoteDarwinDevice::GetSDKDirectoryForLatestOSVersion() {
 const char *PlatformRemoteDarwinDevice::GetDeviceSupportDirectory() {
   std::string platform_dir = "/Platforms/" + GetPlatformName() + "/DeviceSupport";
   if (m_device_support_directory.empty()) {
-    if (FileSpec fspec = GetXcodeDeveloperDirectory()) {
+    if (FileSpec fspec = HostInfo::GetXcodeDeveloperDirectory()) {
       m_device_support_directory = fspec.GetPath();
       m_device_support_directory.append(platform_dir.c_str());
     } else {
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp
index b09c4051e2855..1a3e087148642 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformiOSSimulator.cpp
@@ -260,7 +260,7 @@ EnumerateDirectoryCallback(void *baton, llvm::sys::fs::file_type ft,
 const char *PlatformiOSSimulator::GetSDKDirectoryAsCString() {
   std::lock_guard<std::mutex> guard(m_sdk_dir_mutex);
   if (m_sdk_directory.empty()) {
-    if (FileSpec fspec = GetXcodeDeveloperDirectory()) {
+    if (FileSpec fspec = HostInfo::GetXcodeDeveloperDirectory()) {
       std::string developer_dir = fspec.GetPath();
       char sdks_directory[PATH_MAX];
       char sdk_dirname[PATH_MAX];
diff --git a/lldb/source/Utility/XcodeSDK.cpp b/lldb/source/Utility/XcodeSDK.cpp
index d5ef8e2951ff9..066bf457966c9 100644
--- a/lldb/source/Utility/XcodeSDK.cpp
+++ b/lldb/source/Utility/XcodeSDK.cpp
@@ -285,3 +285,25 @@ XcodeSDK::Type XcodeSDK::GetSDKTypeForTriple(const llvm::Triple &triple) {
     return XcodeSDK::unknown;
   }
 }
+
+std::string XcodeSDK::FindXcodeContentsDirectoryInPath(llvm::StringRef path) {
+  auto begin = llvm::sys::path::begin(path);
+  auto end = llvm::sys::path::end(path);
+
+  // Iterate over the path components until we find something that ends with
+  // .app. If the next component is Contents then we've found the Contents
+  // directory.
+  for (auto it = begin; it != end; ++it) {
+    if (it->endswith(".app")) {
+      auto next = it;
+      if (++next != end && *next == "Contents") {
+        llvm::SmallString<128> buffer;
+        llvm::sys::path::append(buffer, begin, ++next,
+                                llvm::sys::path::Style::posix);
+        return buffer.str().str();
+      }
+    }
+  }
+
+  return {};
+}
diff --git a/lldb/unittests/Platform/PlatformDarwinTest.cpp b/lldb/unittests/Platform/PlatformDarwinTest.cpp
index 0a4c802c30bb1..285dc2ee3db78 100644
--- a/lldb/unittests/Platform/PlatformDarwinTest.cpp
+++ b/lldb/unittests/Platform/PlatformDarwinTest.cpp
@@ -20,7 +20,6 @@ using namespace lldb_private;
 struct PlatformDarwinTester : public PlatformDarwin {
 public:
   using PlatformDarwin::FindComponentInPath;
-  using PlatformDarwin::FindXcodeContentsDirectoryInPath;
 };
 
 TEST(PlatformDarwinTest, TestParseVersionBuildDir) {
@@ -51,44 +50,6 @@ TEST(PlatformDarwinTest, TestParseVersionBuildDir) {
   EXPECT_EQ(llvm::VersionTuple(3, 4, 5), V);
 }
 
-TEST(PlatformDarwinTest, FindXcodeContentsDirectoryInPath) {
-  std::string standard =
-      "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/"
-      "Developer/SDKs/MacOSX.sdk";
-  EXPECT_EQ("/Applications/Xcode.app/Contents",
-            PlatformDarwinTester::FindXcodeContentsDirectoryInPath(standard));
-
-  std::string standard_version =
-      "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/"
-      "Developer/SDKs/MacOSX10.15.sdk";
-  EXPECT_EQ(
-      "/Applications/Xcode.app/Contents",
-      PlatformDarwinTester::FindXcodeContentsDirectoryInPath(standard_version));
-
-  std::string beta = "/Applications/Xcode-beta.app/Contents/Developer/"
-                     "Platforms/MacOSX.platform/"
-                     "Developer/SDKs/MacOSX10.15.sdk";
-  EXPECT_EQ("/Applications/Xcode-beta.app/Contents",
-            PlatformDarwinTester::FindXcodeContentsDirectoryInPath(beta));
-
-  std::string no_app =
-      "/Applications/Xcode/Contents/Developer/Platforms/MacOSX.platform/"
-      "Developer/SDKs/MacOSX10.15.sdk";
-  EXPECT_EQ("", PlatformDarwinTester::FindXcodeContentsDirectoryInPath(no_app));
-
-  std::string no_contents =
-      "/Applications/Xcode.app/Developer/Platforms/MacOSX.platform/"
-      "Developer/SDKs/MacOSX10.15.sdk";
-  EXPECT_EQ(
-      "", PlatformDarwinTester::FindXcodeContentsDirectoryInPath(no_contents));
-
-  std::string no_capitalization =
-      "/Applications/Xcode.app/contents/Developer/Platforms/MacOSX.platform/"
-      "Developer/SDKs/MacOSX10.15.sdk";
-  EXPECT_EQ("", PlatformDarwinTester::FindXcodeContentsDirectoryInPath(
-                    no_capitalization));
-}
-
 TEST(PlatformDarwinTest, FindComponentInPath) {
   EXPECT_EQ("/path/to/foo",
             PlatformDarwinTester::FindComponentInPath("/path/to/foo/", "foo"));
diff --git a/lldb/unittests/Utility/XcodeSDKTest.cpp b/lldb/unittests/Utility/XcodeSDKTest.cpp
index 21419a299998c..69e4d2caa01ed 100644
--- a/lldb/unittests/Utility/XcodeSDKTest.cpp
+++ b/lldb/unittests/Utility/XcodeSDKTest.cpp
@@ -204,3 +204,38 @@ TEST(XcodeSDKTest, GetSDKTypeForTriple) {
   EXPECT_EQ(XcodeSDK::GetSDKTypeForTriple(llvm::Triple("i386-unknown-netbsd")),
             XcodeSDK::Type::unknown);
 }
+
+TEST(XcodeSDKTest, FindXcodeContentsDirectoryInPath) {
+  std::string standard =
+      "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/"
+      "Developer/SDKs/MacOSX.sdk";
+  EXPECT_EQ("/Applications/Xcode.app/Contents",
+            XcodeSDK::FindXcodeContentsDirectoryInPath(standard));
+
+  std::string standard_version =
+      "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/"
+      "Developer/SDKs/MacOSX10.15.sdk";
+  EXPECT_EQ("/Applications/Xcode.app/Contents",
+            XcodeSDK::FindXcodeContentsDirectoryInPath(standard_version));
+
+  std::string beta = "/Applications/Xcode-beta.app/Contents/Developer/"
+                     "Platforms/MacOSX.platform/"
+                     "Developer/SDKs/MacOSX10.15.sdk";
+  EXPECT_EQ("/Applications/Xcode-beta.app/Contents",
+            XcodeSDK::FindXcodeContentsDirectoryInPath(beta));
+
+  std::string no_app =
+      "/Applications/Xcode/Contents/Developer/Platforms/MacOSX.platform/"
+      "Developer/SDKs/MacOSX10.15.sdk";
+  EXPECT_EQ("", XcodeSDK::FindXcodeContentsDirectoryInPath(no_app));
+
+  std::string no_contents =
+      "/Applications/Xcode.app/Developer/Platforms/MacOSX.platform/"
+      "Developer/SDKs/MacOSX10.15.sdk";
+  EXPECT_EQ("", XcodeSDK::FindXcodeContentsDirectoryInPath(no_contents));
+
+  std::string no_capitalization =
+      "/Applications/Xcode.app/contents/Developer/Platforms/MacOSX.platform/"
+      "Developer/SDKs/MacOSX10.15.sdk";
+  EXPECT_EQ("", XcodeSDK::FindXcodeContentsDirectoryInPath(no_capitalization));
+}

From 79daa3d896495e2755e530ce0658be3e80dfe4c9 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Thu, 4 Jun 2020 19:02:11 -0700
Subject: [PATCH 04/24] Teach GetXcodeSDK to look in the Xcode that contains
 LLDB

instead of preferring the one chosen with xcode-select.

<rdar://problem/64000666>

Differential Revision: https://reviews.llvm.org/D81210
---
 lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index 5c459a0413837..cf2f2dcb3aff8 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -358,7 +358,21 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
   XcodeSDK::Info info = sdk.Parse();
   std::string sdk_name = XcodeSDK::GetCanonicalName(info);
   auto find_sdk = [](std::string sdk_name) -> std::string {
-    std::string xcrun_cmd = "xcrun --show-sdk-path --sdk " + sdk_name;
+    std::string xcrun_cmd;
+    Environment env = Host::GetEnvironment();
+    std::string developer_dir = env.lookup("DEVELOPER_DIR");
+    if (developer_dir.empty())
+      if (FileSpec fspec = HostInfo::GetShlibDir())
+        if (FileSystem::Instance().Exists(fspec)) {
+          FileSpec path(
+              XcodeSDK::FindXcodeContentsDirectoryInPath(fspec.GetPath()));
+          if (path.RemoveLastPathComponent())
+            developer_dir = path.GetPath();
+        }
+    if (!developer_dir.empty())
+      xcrun_cmd = "/usr/bin/env DEVELOPER_DIR=\"" + developer_dir + "\" ";
+    xcrun_cmd += "xcrun --show-sdk-path --sdk " + sdk_name;
+
     int status = 0;
     int signo = 0;
     std::string output_str;

From eaa8af93228c1e1bce078d66605db83b26eda8e5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Sat, 30 May 2020 20:28:36 -0400
Subject: [PATCH 05/24] GlobalISel: Add helper for constructing load from
 offset

---
 .../llvm/CodeGen/GlobalISel/MachineIRBuilder.h  |  8 ++++++++
 .../lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 17 +++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 0252a324de231..ae28d728a16df 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -804,6 +804,14 @@ class MachineIRBuilder {
   MachineInstrBuilder buildLoadInstr(unsigned Opcode, const DstOp &Res,
                                      const SrcOp &Addr, MachineMemOperand &MMO);
 
+  /// Helper to create a load from a constant offset given a base address. Load
+  /// the type of \p Dst from \p Offset from the given base address and memory
+  /// operand.
+  MachineInstrBuilder buildLoadFromOffset(const DstOp &Dst,
+                                          const SrcOp &BasePtr,
+                                          MachineMemOperand &BaseMMO,
+                                          int64_t Offset);
+
   /// Build and insert `G_STORE Val, Addr, MMO`.
   ///
   /// Stores the value \p Val to \p Addr.
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 6e53ec2bb46eb..4236fdd820842 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -376,6 +376,23 @@ MachineInstrBuilder MachineIRBuilder::buildLoadInstr(unsigned Opcode,
   return MIB;
 }
 
+MachineInstrBuilder MachineIRBuilder::buildLoadFromOffset(
+  const DstOp &Dst, const SrcOp &BasePtr,
+  MachineMemOperand &BaseMMO, int64_t Offset) {
+  LLT LoadTy = Dst.getLLTTy(*getMRI());
+  MachineMemOperand *OffsetMMO =
+    getMF().getMachineMemOperand(&BaseMMO, Offset, LoadTy.getSizeInBytes());
+
+  if (Offset == 0) // This may be a size or type changing load.
+    return buildLoad(Dst, BasePtr, *OffsetMMO);
+
+  LLT PtrTy = BasePtr.getLLTTy(*getMRI());
+  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
+  auto ConstOffset = buildConstant(OffsetTy, Offset);
+  auto Ptr = buildPtrAdd(PtrTy, BasePtr, ConstOffset);
+  return buildLoad(Dst, Ptr, *OffsetMMO);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildStore(const SrcOp &Val,
                                                  const SrcOp &Addr,
                                                  MachineMemOperand &MMO) {

From b67f86020889852d2b318fbf4ec6c10389f8efd8 Mon Sep 17 00:00:00 2001
From: Alexandre Ganea <alexandre.ganea@ubisoft.com>
Date: Thu, 4 Jun 2020 15:30:45 -0400
Subject: [PATCH 06/24] [llvm-pdbutil] Add missing --id-stats to dump the IPI
 stream

Before this patch, llvm-pdbutil supported only --type-stats, which dumps stats about a PDB TPI stream.
Add --id-stats for completeness, to dump the same stats for the IPI stream.
---
 llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp | 11 +++++------
 llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp    |  4 ++++
 llvm/tools/llvm-pdbutil/llvm-pdbutil.h      |  1 +
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
index 32448cec88f6f..aa185e8a2f228 100644
--- a/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/llvm/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -110,7 +110,7 @@ Error DumpOutputStyle::dump() {
     P.NewLine();
   }
 
-  if (opts::dump::DumpTypeStats) {
+  if (opts::dump::DumpTypeStats || opts::dump::DumpIDStats) {
     if (auto EC = dumpTypeStats())
       return EC;
     P.NewLine();
@@ -701,7 +701,8 @@ Error DumpOutputStyle::dumpTypeStats() {
 
   // Iterate the types, categorize by kind, accumulate size stats.
   StatCollection TypeStats;
-  LazyRandomTypeCollection &Types = File.types();
+  LazyRandomTypeCollection &Types =
+      opts::dump::DumpTypeStats ? File.types() : File.ids();
   for (Optional<TypeIndex> TI = Types.getFirst(); TI; TI = Types.getNext(*TI)) {
     CVType Type = Types.getType(*TI);
     TypeStats.update(uint32_t(Type.kind()), Type.length());
@@ -710,18 +711,16 @@ Error DumpOutputStyle::dumpTypeStats() {
   P.NewLine();
   P.formatLine("  Types");
   AutoIndent Indent(P);
-  P.formatLine("{0,14}: {1,7} entries ({2,12:N} bytes, {3,7} avg)", "Total",
+  P.formatLine("{0,16}: {1,7} entries ({2,12:N} bytes, {3,7} avg)", "Total",
                TypeStats.Totals.Count, TypeStats.Totals.Size,
                (double)TypeStats.Totals.Size / TypeStats.Totals.Count);
   P.formatLine("{0}", fmt_repeat('-', 74));
 
   for (const auto &K : TypeStats.getStatsSortedBySize()) {
-    P.formatLine("{0,14}: {1,7} entries ({2,12:N} bytes, {3,7} avg)",
+    P.formatLine("{0,16}: {1,7} entries ({2,12:N} bytes, {3,7} avg)",
                  formatTypeLeafKind(TypeLeafKind(K.first)), K.second.Count,
                  K.second.Size, (double)K.second.Size / K.second.Count);
   }
-
-
   return Error::success();
 }
 
diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 5dc666e06a1a9..00092e71c6b49 100644
--- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -466,6 +466,10 @@ cl::opt<bool> DumpTypeStats(
     "type-stats",
     cl::desc("Dump a detailed breakdown of type usage/size"),
     cl::cat(MsfOptions), cl::sub(DumpSubcommand));
+cl::opt<bool> DumpIDStats(
+    "id-stats",
+    cl::desc("Dump a detailed breakdown of IPI types usage/size"),
+    cl::cat(MsfOptions), cl::sub(DumpSubcommand));
 cl::opt<bool> DumpUdtStats(
     "udt-stats",
     cl::desc("Dump a detailed breakdown of S_UDT record usage / stats"),
diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.h b/llvm/tools/llvm-pdbutil/llvm-pdbutil.h
index 321f41bba7f17..9fe92c2c9d75e 100644
--- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.h
+++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.h
@@ -141,6 +141,7 @@ extern llvm::cl::opt<bool> DumpFpm;
 extern llvm::cl::opt<bool> DumpStreams;
 extern llvm::cl::opt<bool> DumpSymbolStats;
 extern llvm::cl::opt<bool> DumpTypeStats;
+extern llvm::cl::opt<bool> DumpIDStats;
 extern llvm::cl::opt<bool> DumpUdtStats;
 extern llvm::cl::opt<bool> DumpStreamBlocks;
 

From e78431354bcb6bec5be9adf4ea37d860445f8c16 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Thu, 4 Jun 2020 17:40:41 -0700
Subject: [PATCH 07/24] lld: use modern library search ordering

This merges the static and shared library lookups so that lld behaves as
if `-search_paths_first` was specified, which is also the default
behaviour of ld64 (and now lld). Unify the paths, and use
`llvm::sys::path` to build the file paths so that the lookup is truly
agnostic to the host.
---
 lld/MachO/Driver.cpp               | 32 ++++++++++------------
 lld/test/MachO/link-search-order.s | 43 ++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 18 deletions(-)
 create mode 100644 lld/test/MachO/link-search-order.s

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 723c62c391850..e0ebf8e5710fd 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -73,21 +73,18 @@ opt::InputArgList MachOOptTable::parse(ArrayRef<const char *> argv) {
   return args;
 }
 
-// This is for -lfoo. We'll look for libfoo.dylib from search paths.
-static Optional<std::string> findDylib(StringRef name) {
-  for (StringRef dir : config->searchPaths) {
-    std::string path = (dir + "/lib" + name + ".dylib").str();
-    if (fs::exists(path))
-      return path;
-  }
-  return None;
-}
+static Optional<std::string> findLibrary(StringRef name) {
+  std::string shared = (llvm::Twine("lib") + name + ".dylib").str();
+  std::string archive = (llvm::Twine("lib") + name + ".a").str();
+  llvm::SmallString<260> location;
 
-static Optional<std::string> findArchive(StringRef name) {
   for (StringRef dir : config->searchPaths) {
-    std::string path = (dir + "/lib" + name + ".a").str();
-    if (fs::exists(path))
-      return path;
+    for (StringRef library : {shared, archive}) {
+      location = dir;
+      llvm::sys::path::append(location, library);
+      if (fs::exists(location))
+        return location.str().str();
+    }
   }
   return None;
 }
@@ -296,12 +293,11 @@ bool macho::link(llvm::ArrayRef<const char *> argsArr, bool canExitEarly,
       break;
     case OPT_l: {
       StringRef name = arg->getValue();
-      if (Optional<std::string> path = findDylib(name))
+      if (Optional<std::string> path = findLibrary(name)) {
         addFile(*path);
-      else if (Optional<std::string> path = findArchive(name))
-        addFile(*path);
-      else
-        error("library not found for -l" + name);
+        break;
+      }
+      error("library not found for -l" + name);
       break;
     }
     case OPT_platform_version: {
diff --git a/lld/test/MachO/link-search-order.s b/lld/test/MachO/link-search-order.s
new file mode 100644
index 0000000000000..289293a8cb471
--- /dev/null
+++ b/lld/test/MachO/link-search-order.s
@@ -0,0 +1,43 @@
+# REQUIRES: x86
+
+# RUN: mkdir -p %t
+#
+# RUN: llvm-mc -filetype obj -triple x86_64-apple-darwin %p/Inputs/libhello.s -o %t/hello.o
+# RUN: lld -flavor darwinnew -dylib -install_name @executable_path/libhello.dylib %t/hello.o -o %t/libhello.dylib
+#
+# RUN: llvm-mc -filetype obj -triple x86_64-apple-darwin %p/Inputs/libgoodbye.s -o %t/goodbye.o
+# RUN: lld -flavor darwinnew -dylib -install_name @executable_path/libgoodbye.dylib %t/goodbye.o -o %t/libgoodbye.dylib
+# RUN: llvm-ar --format=darwin crs %t/libgoodbye.a %t/goodbye.o
+#
+# RUN: llvm-mc -filetype obj -triple x86_64-apple-darwin %s -o %t/test.o
+# RUN: lld -flavor darwinnew -o %t/test -Z -L%t -lhello -lgoodbye %t/test.o
+#
+# RUN: llvm-objdump --macho --dylibs-used %t/test | FileCheck %s
+
+# CHECK: @executable_path/libhello.dylib
+# CHECK: @executable_path/libgoodbye.dylib
+# CHECK: /usr/lib/libSystem.B.dylib
+
+.section __TEXT,__text
+.global _main
+
+_main:
+  movl $0x2000004, %eax                         # write()
+  mov $1, %rdi                                  # stdout
+  movq _hello_world@GOTPCREL(%rip), %rsi
+  mov $13, %rdx                                 # length
+  syscall
+
+  movl $0x2000004, %eax                         # write()
+  mov $1, %rdi                                  # stdout
+  movq _hello_its_me@GOTPCREL(%rip), %rsi
+  mov $15, %rdx                                 # length
+  syscall
+
+  movl $0x2000004, %eax                         # write()
+  mov $1, %rdi                                  # stdout
+  movq _goodbye_world@GOTPCREL(%rip), %rsi
+  mov $15, %rdx                                 # length
+  syscall
+  mov $0, %rax
+  ret

From 392843617acbce5baf6ca7c7fff8dbac58bf853f Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Fri, 5 Jun 2020 15:31:11 -0400
Subject: [PATCH 08/24] Attempt to fix hip-include-path.hip

---
 clang/test/Driver/hip-include-path.hip | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip
index 12d6aa2a5216c..8c0d317936b57 100644
--- a/clang/test/Driver/hip-include-path.hip
+++ b/clang/test/Driver/hip-include-path.hip
@@ -1,6 +1,8 @@
 // REQUIRES: clang-driver
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
+// REQUIRES: libgcc
+// UNSUPPORTED: system-windows
 
 // RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
 // RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpulib %s 2>&1 \
@@ -14,7 +16,7 @@
 // RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpuinc -nogpulib %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,WRAP,NOHIP %s
 
-// COMMON-LABEL: clang{{.*}} -cc1 -triple amdgcn-amd-amdhsa
+// COMMON: "{{[^"]*}}clang{{[^"]*}}"
 // WRAP: clang/{{.*}}/include/cuda_wrappers
 // NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
 // HIP: {{.*}}Inputs/rocm/include
@@ -22,7 +24,7 @@
 // skip check of standard C++ include path
 // COMMON: clang/{{.*}}/include
 
-// COMMON-LABEL: clang{{.*}} -cc1 -triple x86_64
+// COMMON: "{{[^"]*}}clang{{[^"]*}}"
 // WRAP: clang/{{.*}}/include/cuda_wrappers
 // NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
 // HIP: {{.*}}Inputs/rocm/include

From 8a8c6913a931e8bbd119012f4badd81155a0f48a Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Fri, 5 Jun 2020 15:41:00 -0400
Subject: [PATCH 09/24] Revert "[HIP] Add default header and include path"

This reverts commit 11d06b9511bd25aabbfad10dff548b0ce29135a5.
---
 .../clang/Basic/DiagnosticDriverKinds.td      |    2 +-
 clang/include/clang/Driver/Options.td         |    3 +-
 clang/include/clang/Driver/ToolChain.h        |    4 -
 clang/lib/Driver/ToolChain.cpp                |    3 -
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |   34 -
 clang/lib/Driver/ToolChains/AMDGPU.h          |  143 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |   10 +-
 clang/lib/Driver/ToolChains/Cuda.cpp          |    4 +-
 clang/lib/Driver/ToolChains/Gnu.cpp           |    2 +-
 clang/lib/Driver/ToolChains/Gnu.h             |    2 -
 clang/lib/Driver/ToolChains/HIP.cpp           |    5 -
 clang/lib/Driver/ToolChains/HIP.h             |    2 -
 clang/lib/Driver/ToolChains/Linux.cpp         |    5 -
 clang/lib/Driver/ToolChains/Linux.h           |    2 -
 clang/lib/Driver/ToolChains/MSVC.cpp          |    8 +-
 clang/lib/Driver/ToolChains/MSVC.h            |    5 -
 clang/lib/Driver/ToolChains/ROCm.h            |  166 ---
 clang/lib/Headers/CMakeLists.txt              |    3 -
 .../__clang_cuda_math_forward_declares.h      |    4 +-
 .../Headers/__clang_hip_libdevice_declares.h  |  326 -----
 clang/lib/Headers/__clang_hip_math.h          | 1185 -----------------
 .../lib/Headers/__clang_hip_runtime_wrapper.h |   64 -
 .../amdgcn/bitcode/hip.bc                     |    0
 .../amdgcn/bitcode/ockl.bc                    |    0
 .../oclc_correctly_rounded_sqrt_off.bc        |    0
 .../bitcode/oclc_correctly_rounded_sqrt_on.bc |    0
 .../amdgcn/bitcode/oclc_daz_opt_off.bc        |    0
 .../amdgcn/bitcode/oclc_daz_opt_on.bc         |    0
 .../amdgcn/bitcode/oclc_finite_only_off.bc    |    0
 .../amdgcn/bitcode/oclc_finite_only_on.bc     |    0
 .../amdgcn/bitcode/oclc_isa_version_1010.bc   |    0
 .../amdgcn/bitcode/oclc_isa_version_1011.bc   |    0
 .../amdgcn/bitcode/oclc_isa_version_1012.bc   |    0
 .../amdgcn/bitcode/oclc_isa_version_803.bc    |    0
 .../amdgcn/bitcode/oclc_isa_version_900.bc    |    0
 .../amdgcn/bitcode/oclc_unsafe_math_off.bc    |    0
 .../amdgcn/bitcode/oclc_unsafe_math_on.bc     |    0
 .../bitcode/oclc_wavefrontsize64_off.bc       |    0
 .../amdgcn/bitcode/oclc_wavefrontsize64_on.bc |    0
 .../amdgcn/bitcode/ocml.bc                    |    0
 .../amdgcn/bitcode/opencl.bc                  |    0
 .../Inputs/rocm/include/hip/hip_runtime.h     |    0
 clang/test/Driver/hip-device-libs.hip         |   24 +-
 clang/test/Driver/hip-include-path.hip        |   33 -
 clang/test/Driver/rocm-detect.cl              |    4 +-
 clang/test/Driver/rocm-detect.hip             |    6 +-
 clang/test/Driver/rocm-device-libs.cl         |   34 +-
 clang/test/Driver/rocm-not-found.cl           |    2 +-
 .../test/Preprocessor/hip-host-cpu-macros.cu  |    2 +-
 49 files changed, 190 insertions(+), 1897 deletions(-)
 delete mode 100644 clang/lib/Driver/ToolChains/ROCm.h
 delete mode 100644 clang/lib/Headers/__clang_hip_libdevice_declares.h
 delete mode 100644 clang/lib/Headers/__clang_hip_math.h
 delete mode 100644 clang/lib/Headers/__clang_hip_runtime_wrapper.h
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/hip.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/ockl.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_daz_opt_off.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_daz_opt_on.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_finite_only_off.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_finite_only_on.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_isa_version_1010.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_isa_version_1011.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_isa_version_1012.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_isa_version_803.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_isa_version_900.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_unsafe_math_off.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_unsafe_math_on.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_wavefrontsize64_off.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/oclc_wavefrontsize64_on.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/ocml.bc (100%)
 rename clang/test/Driver/Inputs/{rocm => rocm-device-libs}/amdgcn/bitcode/opencl.bc (100%)
 delete mode 100644 clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h
 delete mode 100644 clang/test/Driver/hip-include-path.hip

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 80242e53a1aeb..d010a7dfb2de9 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -58,7 +58,7 @@ def err_drv_no_cuda_libdevice : Error<
 
 def err_drv_no_rocm_installation : Error<
   "cannot find ROCm installation.  Provide its path via --rocm-path, or pass "
-  "-nogpulib and -nogpuinc to build without ROCm device library and HIP includes.">;
+  "-nogpulib.">;
 def err_drv_no_rocm_device_lib : Error<
   "cannot find device library for %0. Provide path to different ROCm installation "
   "via --rocm-path, or pass -nogpulib to build without linking default libraries.">;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c7cae452215a8..d3f0ccb09ef4f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2640,8 +2640,7 @@ def no_pedantic : Flag<["-", "--"], "no-pedantic">, Group<pedantic_Group>;
 def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
 def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option, CoreOption]>,
   HelpText<"Disable builtin #include directories">;
-def nogpuinc : Flag<["-"], "nogpuinc">;
-def : Flag<["-"], "nocudainc">, Alias<nogpuinc>;
+def nocudainc : Flag<["-"], "nocudainc">;
 def nogpulib : Flag<["-"], "nogpulib">,
   HelpText<"Do not link device library for CUDA/HIP device compilation">;
 def : Flag<["-"], "nocudalib">, Alias<nogpulib>;
diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index e8bb86be55540..0a35e9e6a01a0 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -617,10 +617,6 @@ class ToolChain {
   virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                   llvm::opt::ArgStringList &CC1Args) const;
 
-  /// Add arguments to use system-specific HIP includes.
-  virtual void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                                 llvm::opt::ArgStringList &CC1Args) const;
-
   /// Add arguments to use MCU GCC toolchain includes.
   virtual void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                                    llvm::opt::ArgStringList &CC1Args) const;
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 2bda7de8ac3a6..cf04fd07e2a0e 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -991,9 +991,6 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
 void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                    ArgStringList &CC1Args) const {}
 
-void ToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                                  ArgStringList &CC1Args) const {}
-
 void ToolChain::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                     ArgStringList &CC1Args) const {}
 
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index 44a7e7fc3be04..3e51bd00bae47 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -199,40 +199,6 @@ void RocmInstallationDetector::print(raw_ostream &OS) const {
     OS << "Found ROCm installation: " << InstallPath << '\n';
 }
 
-void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                                                 ArgStringList &CC1Args) const {
-  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
-    // HIP header includes standard library wrapper headers under clang
-    // cuda_wrappers directory. Since these wrapper headers include_next
-    // standard C++ headers, whereas libc++ headers include_next other clang
-    // headers. The include paths have to follow this order:
-    // - wrapper include path
-    // - standard C++ include path
-    // - other clang include path
-    // Since standard C++ and other clang include paths are added in other
-    // places after this function, here we only need to make sure wrapper
-    // include path is added.
-    SmallString<128> P(D.ResourceDir);
-    llvm::sys::path::append(P, "include");
-    llvm::sys::path::append(P, "cuda_wrappers");
-    CC1Args.push_back("-internal-isystem");
-    CC1Args.push_back(DriverArgs.MakeArgString(P));
-    CC1Args.push_back("-include");
-    CC1Args.push_back("__clang_hip_runtime_wrapper.h");
-  }
-
-  if (DriverArgs.hasArg(options::OPT_nogpuinc))
-    return;
-
-  if (!isValid()) {
-    D.Diag(diag::err_drv_no_rocm_installation);
-    return;
-  }
-
-  CC1Args.push_back("-internal-isystem");
-  CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
-}
-
 void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                                   const InputInfo &Output,
                                   const InputInfoList &Inputs,
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h
index 9418a0a509c7a..230af868298ff 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.h
+++ b/clang/lib/Driver/ToolChains/AMDGPU.h
@@ -10,7 +10,6 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_AMDGPU_H
 
 #include "Gnu.h"
-#include "ROCm.h"
 #include "clang/Driver/Options.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
@@ -22,6 +21,148 @@
 namespace clang {
 namespace driver {
 
+/// A class to find a viable ROCm installation.
+/// TODO: Generalize to handle libclc.
+class RocmInstallationDetector {
+private:
+  struct ConditionalLibrary {
+    SmallString<0> On;  // Returned by get(true).
+    SmallString<0> Off; // Returned by get(false).
+
+    bool isValid() const {
+      return !On.empty() && !Off.empty();
+    }
+
+    StringRef get(bool Enabled) const {
+      assert(isValid());
+      return Enabled ? On : Off;
+    }
+  };
+
+  const Driver &D;
+  bool IsValid = false;
+  //RocmVersion Version = RocmVersion::UNKNOWN;
+  SmallString<0> InstallPath;
+  //SmallString<0> BinPath;
+  SmallString<0> LibPath;
+  SmallString<0> LibDevicePath;
+  SmallString<0> IncludePath;
+  llvm::StringMap<std::string> LibDeviceMap;
+
+  // Libraries that are always linked.
+  SmallString<0> OCML;
+  SmallString<0> OCKL;
+
+  // Libraries that are always linked depending on the language.
+  SmallString<0> OpenCL;
+  SmallString<0> HIP;
+
+  // Libraries swapped based on compile flags.
+  ConditionalLibrary WavefrontSize64;
+  ConditionalLibrary FiniteOnly;
+  ConditionalLibrary UnsafeMath;
+  ConditionalLibrary DenormalsAreZero;
+  ConditionalLibrary CorrectlyRoundedSqrt;
+
+  bool allGenericLibsValid() const {
+    return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() && !HIP.empty() &&
+           WavefrontSize64.isValid() && FiniteOnly.isValid() &&
+           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
+           CorrectlyRoundedSqrt.isValid();
+  }
+
+  // GPU architectures for which we have raised an error in
+  // CheckRocmVersionSupportsArch.
+  mutable llvm::SmallSet<CudaArch, 4> ArchsWithBadVersion;
+
+  void scanLibDevicePath();
+
+public:
+  RocmInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
+                           const llvm::opt::ArgList &Args);
+
+  /// Add arguments needed to link default bitcode libraries.
+  void addCommonBitcodeLibCC1Args(const llvm::opt::ArgList &DriverArgs,
+                                  llvm::opt::ArgStringList &CC1Args,
+                                  StringRef LibDeviceFile, bool Wave64,
+                                  bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
+                                  bool FastRelaxedMath, bool CorrectSqrt) const;
+
+  /// Emit an error if Version does not support the given Arch.
+  ///
+  /// If either Version or Arch is unknown, does not emit an error.  Emits at
+  /// most one error per Arch.
+  void CheckRocmVersionSupportsArch(CudaArch Arch) const;
+
+  /// Check whether we detected a valid Rocm install.
+  bool isValid() const { return IsValid; }
+  /// Print information about the detected ROCm installation.
+  void print(raw_ostream &OS) const;
+
+  /// Get the detected Rocm install's version.
+  // RocmVersion version() const { return Version; }
+
+  /// Get the detected Rocm installation path.
+  StringRef getInstallPath() const { return InstallPath; }
+
+  /// Get the detected path to Rocm's bin directory.
+  // StringRef getBinPath() const { return BinPath; }
+
+  /// Get the detected Rocm Include path.
+  StringRef getIncludePath() const { return IncludePath; }
+
+  /// Get the detected Rocm library path.
+  StringRef getLibPath() const { return LibPath; }
+
+  /// Get the detected Rocm device library path.
+  StringRef getLibDevicePath() const { return LibDevicePath; }
+
+  StringRef getOCMLPath() const {
+    assert(!OCML.empty());
+    return OCML;
+  }
+
+  StringRef getOCKLPath() const {
+    assert(!OCKL.empty());
+    return OCKL;
+  }
+
+  StringRef getOpenCLPath() const {
+    assert(!OpenCL.empty());
+    return OpenCL;
+  }
+
+  StringRef getHIPPath() const {
+    assert(!HIP.empty());
+    return HIP;
+  }
+
+  StringRef getWavefrontSize64Path(bool Enabled) const {
+    return WavefrontSize64.get(Enabled);
+  }
+
+  StringRef getFiniteOnlyPath(bool Enabled) const {
+    return FiniteOnly.get(Enabled);
+  }
+
+  StringRef getUnsafeMathPath(bool Enabled) const {
+    return UnsafeMath.get(Enabled);
+  }
+
+  StringRef getDenormalsAreZeroPath(bool Enabled) const {
+    return DenormalsAreZero.get(Enabled);
+  }
+
+  StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
+    return CorrectlyRoundedSqrt.get(Enabled);
+  }
+
+  /// Get libdevice file for \p Gpu, or an empty string if none is mapped.
+  std::string getLibDeviceFile(StringRef Gpu) const {
+    return LibDeviceMap.lookup(Gpu);
+  }
+};
+
 namespace tools {
 namespace amdgpu {
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 32e60c13e1d89..b20048768e444 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1202,14 +1202,12 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_MP);
   Args.AddLastArg(CmdArgs, options::OPT_MV);
 
-  // Add offload include arguments specific for CUDA/HIP.  This must happen
-  // before we -I or -include anything else, because we must pick up the
-  // CUDA/HIP headers from the particular CUDA/ROCm installation, rather than
-  // from e.g. /usr/local/include.
+  // Add offload include arguments specific for CUDA.  This must happen before
+  // we -I or -include anything else, because we must pick up the CUDA headers
+  // from the particular CUDA installation, rather than from e.g.
+  // /usr/local/include.
   if (JA.isOffloading(Action::OFK_Cuda))
     getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
-  if (JA.isOffloading(Action::OFK_HIP))
-    getToolChain().AddHIPIncludeArgs(Args, CmdArgs);
 
   // If we are offloading to a target via OpenMP we need to include the
   // openmp_wrappers folder which contains alternative system headers.
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 55b205921e390..08064de13b5b2 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -241,7 +241,7 @@ void CudaInstallationDetector::AddCudaIncludeArgs(
     CC1Args.push_back(DriverArgs.MakeArgString(P));
   }
 
-  if (DriverArgs.hasArg(options::OPT_nogpuinc))
+  if (DriverArgs.hasArg(options::OPT_nocudainc))
     return;
 
   if (!isValid()) {
@@ -765,7 +765,7 @@ void CudaToolChain::adjustDebugInfoKind(
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
-  if (!DriverArgs.hasArg(options::OPT_nogpuinc) &&
+  if (!DriverArgs.hasArg(options::OPT_nocudainc) &&
       !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
     StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
     assert(!Arch.empty() && "Must have an explicit GPU arch.");
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index a392fa00ea75a..ac9eb46dacb51 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2578,7 +2578,7 @@ bool Generic_GCC::GCCInstallationDetector::ScanGentooGccConfig(
 Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple,
                          const ArgList &Args)
     : ToolChain(D, Triple, Args), GCCInstallation(D),
-      CudaInstallation(D, Triple, Args), RocmInstallation(D, Triple, Args) {
+      CudaInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);
diff --git a/clang/lib/Driver/ToolChains/Gnu.h b/clang/lib/Driver/ToolChains/Gnu.h
index 8ef9b4fdb6cd9..e43414ae35f01 100644
--- a/clang/lib/Driver/ToolChains/Gnu.h
+++ b/clang/lib/Driver/ToolChains/Gnu.h
@@ -10,7 +10,6 @@
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_GNU_H
 
 #include "Cuda.h"
-#include "ROCm.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 #include <set>
@@ -279,7 +278,6 @@ class LLVM_LIBRARY_VISIBILITY Generic_GCC : public ToolChain {
 protected:
   GCCInstallationDetector GCCInstallation;
   CudaInstallationDetector CudaInstallation;
-  RocmInstallationDetector RocmInstallation;
 
 public:
   Generic_GCC(const Driver &D, const llvm::Triple &Triple,
diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index 7e58fe7bb3fb1..a7510f721145d 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@@ -427,11 +427,6 @@ void HIPToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
   HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
 }
 
-void HIPToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                                     ArgStringList &CC1Args) const {
-  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
-}
-
 SanitizerMask HIPToolChain::getSupportedSanitizers() const {
   // The HIPToolChain only supports sanitizers in the sense that it allows
   // sanitizer arguments on the command line if they are supported by the host
diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h
index 353775e5bbf77..01a0ee916bc08 100644
--- a/clang/lib/Driver/ToolChains/HIP.h
+++ b/clang/lib/Driver/ToolChains/HIP.h
@@ -107,8 +107,6 @@ class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain {
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
-  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                         llvm::opt::ArgStringList &CC1Args) const override;
 
   SanitizerMask getSupportedSanitizers() const override;
 
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 7df49c787c8e4..8188c972f4466 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -797,11 +797,6 @@ void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs,
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
-void Linux::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                              ArgStringList &CC1Args) const {
-  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
-}
-
 void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs,
                                 ArgStringList &CC1Args) const {
   if (GCCInstallation.isValid()) {
diff --git a/clang/lib/Driver/ToolChains/Linux.h b/clang/lib/Driver/ToolChains/Linux.h
index 0a7e5bac25b11..550cb96b0b9a8 100644
--- a/clang/lib/Driver/ToolChains/Linux.h
+++ b/clang/lib/Driver/ToolChains/Linux.h
@@ -31,8 +31,6 @@ class LLVM_LIBRARY_VISIBILITY Linux : public Generic_ELF {
       llvm::opt::ArgStringList &CC1Args) const override;
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
-  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                         llvm::opt::ArgStringList &CC1Args) const override;
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
   CXXStdlibType GetDefaultCXXStdlibType() const override;
diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp
index 8271ca780f801..c8b272c609198 100644
--- a/clang/lib/Driver/ToolChains/MSVC.cpp
+++ b/clang/lib/Driver/ToolChains/MSVC.cpp
@@ -739,8 +739,7 @@ std::unique_ptr<Command> visualstudio::Compiler::GetCommand(
 
 MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
-    : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args),
-      RocmInstallation(D, Triple, Args) {
+    : ToolChain(D, Triple, Args), CudaInstallation(D, Triple, Args) {
   getProgramPaths().push_back(getDriver().getInstalledDir());
   if (getDriver().getInstalledDir() != getDriver().Dir)
     getProgramPaths().push_back(getDriver().Dir);
@@ -798,11 +797,6 @@ void MSVCToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
-void MSVCToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
-                                      ArgStringList &CC1Args) const {
-  RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args);
-}
-
 void MSVCToolChain::printVerboseInfo(raw_ostream &OS) const {
   CudaInstallation.print(OS);
 }
diff --git a/clang/lib/Driver/ToolChains/MSVC.h b/clang/lib/Driver/ToolChains/MSVC.h
index 85208eaa3cc31..41a69a82fecfe 100644
--- a/clang/lib/Driver/ToolChains/MSVC.h
+++ b/clang/lib/Driver/ToolChains/MSVC.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MSVC_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MSVC_H
 
-#include "AMDGPU.h"
 #include "Cuda.h"
 #include "clang/Basic/DebugInfoOptions.h"
 #include "clang/Driver/Compilation.h"
@@ -126,9 +125,6 @@ class LLVM_LIBRARY_VISIBILITY MSVCToolChain : public ToolChain {
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
 
-  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                         llvm::opt::ArgStringList &CC1Args) const override;
-
   bool getWindowsSDKLibraryPath(std::string &path) const;
   /// Check if Universal CRT should be used if available
   bool getUniversalCRTLibraryPath(std::string &path) const;
@@ -159,7 +155,6 @@ class LLVM_LIBRARY_VISIBILITY MSVCToolChain : public ToolChain {
   std::string VCToolChainPath;
   ToolsetLayout VSLayout = ToolsetLayout::OlderVS;
   CudaInstallationDetector CudaInstallation;
-  RocmInstallationDetector RocmInstallation;
 };
 
 } // end namespace toolchains
diff --git a/clang/lib/Driver/ToolChains/ROCm.h b/clang/lib/Driver/ToolChains/ROCm.h
deleted file mode 100644
index 9f5fa451472bc..0000000000000
--- a/clang/lib/Driver/ToolChains/ROCm.h
+++ /dev/null
@@ -1,166 +0,0 @@
-//===--- ROCm.h - ROCm installation detector --------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
-#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
-
-#include "clang/Basic/Cuda.h"
-#include "clang/Driver/Options.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
-
-namespace clang {
-namespace driver {
-
-/// A class to find a viable ROCM installation
-/// TODO: Generalize to handle libclc.
-class RocmInstallationDetector {
-private:
-  struct ConditionalLibrary {
-    SmallString<0> On;
-    SmallString<0> Off;
-
-    bool isValid() const { return !On.empty() && !Off.empty(); }
-
-    StringRef get(bool Enabled) const {
-      assert(isValid());
-      return Enabled ? On : Off;
-    }
-  };
-
-  const Driver &D;
-  bool IsValid = false;
-  // RocmVersion Version = RocmVersion::UNKNOWN;
-  SmallString<0> InstallPath;
-  // SmallString<0> BinPath;
-  SmallString<0> LibPath;
-  SmallString<0> LibDevicePath;
-  SmallString<0> IncludePath;
-  llvm::StringMap<std::string> LibDeviceMap;
-
-  // Libraries that are always linked.
-  SmallString<0> OCML;
-  SmallString<0> OCKL;
-
-  // Libraries that are always linked depending on the language
-  SmallString<0> OpenCL;
-  SmallString<0> HIP;
-
-  // Libraries swapped based on compile flags.
-  ConditionalLibrary WavefrontSize64;
-  ConditionalLibrary FiniteOnly;
-  ConditionalLibrary UnsafeMath;
-  ConditionalLibrary DenormalsAreZero;
-  ConditionalLibrary CorrectlyRoundedSqrt;
-
-  bool allGenericLibsValid() const {
-    return !OCML.empty() && !OCKL.empty() && !OpenCL.empty() && !HIP.empty() &&
-           WavefrontSize64.isValid() && FiniteOnly.isValid() &&
-           UnsafeMath.isValid() && DenormalsAreZero.isValid() &&
-           CorrectlyRoundedSqrt.isValid();
-  }
-
-  // GPU architectures for which we have raised an error in
-  // CheckRocmVersionSupportsArch.
-  mutable llvm::SmallSet<CudaArch, 4> ArchsWithBadVersion;
-
-  void scanLibDevicePath();
-
-public:
-  RocmInstallationDetector(const Driver &D, const llvm::Triple &HostTriple,
-                           const llvm::opt::ArgList &Args);
-
-  /// Add arguments needed to link default bitcode libraries.
-  void addCommonBitcodeLibCC1Args(const llvm::opt::ArgList &DriverArgs,
-                                  llvm::opt::ArgStringList &CC1Args,
-                                  StringRef LibDeviceFile, bool Wave64,
-                                  bool DAZ, bool FiniteOnly, bool UnsafeMathOpt,
-                                  bool FastRelaxedMath, bool CorrectSqrt) const;
-
-  /// Emit an error if Version does not support the given Arch.
-  ///
-  /// If either Version or Arch is unknown, does not emit an error.  Emits at
-  /// most one error per Arch.
-  void CheckRocmVersionSupportsArch(CudaArch Arch) const;
-
-  /// Check whether we detected a valid Rocm install.
-  bool isValid() const { return IsValid; }
-  /// Print information about the detected ROCm installation.
-  void print(raw_ostream &OS) const;
-
-  /// Get the detected Rocm install's version.
-  // RocmVersion version() const { return Version; }
-
-  /// Get the detected Rocm installation path.
-  StringRef getInstallPath() const { return InstallPath; }
-
-  /// Get the detected path to Rocm's bin directory.
-  // StringRef getBinPath() const { return BinPath; }
-
-  /// Get the detected Rocm Include path.
-  StringRef getIncludePath() const { return IncludePath; }
-
-  /// Get the detected Rocm library path.
-  StringRef getLibPath() const { return LibPath; }
-
-  /// Get the detected Rocm device library path.
-  StringRef getLibDevicePath() const { return LibDevicePath; }
-
-  StringRef getOCMLPath() const {
-    assert(!OCML.empty());
-    return OCML;
-  }
-
-  StringRef getOCKLPath() const {
-    assert(!OCKL.empty());
-    return OCKL;
-  }
-
-  StringRef getOpenCLPath() const {
-    assert(!OpenCL.empty());
-    return OpenCL;
-  }
-
-  StringRef getHIPPath() const {
-    assert(!HIP.empty());
-    return HIP;
-  }
-
-  StringRef getWavefrontSize64Path(bool Enabled) const {
-    return WavefrontSize64.get(Enabled);
-  }
-
-  StringRef getFiniteOnlyPath(bool Enabled) const {
-    return FiniteOnly.get(Enabled);
-  }
-
-  StringRef getUnsafeMathPath(bool Enabled) const {
-    return UnsafeMath.get(Enabled);
-  }
-
-  StringRef getDenormalsAreZeroPath(bool Enabled) const {
-    return DenormalsAreZero.get(Enabled);
-  }
-
-  StringRef getCorrectlyRoundedSqrtPath(bool Enabled) const {
-    return CorrectlyRoundedSqrt.get(Enabled);
-  }
-
-  /// Get libdevice file for given architecture
-  std::string getLibDeviceFile(StringRef Gpu) const {
-    return LibDeviceMap.lookup(Gpu);
-  }
-
-  void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                         llvm::opt::ArgStringList &CC1Args) const;
-};
-
-} // end namespace driver
-} // end namespace clang
-
-#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_ROCM_H
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index fd9e3a0d672f6..1a1f7b30f1067 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -45,9 +45,6 @@ set(files
   __clang_cuda_libdevice_declares.h
   __clang_cuda_math_forward_declares.h
   __clang_cuda_runtime_wrapper.h
-  __clang_hip_libdevice_declares.h
-  __clang_hip_math.h
-  __clang_hip_runtime_wrapper.h
   cetintrin.h
   cet.h
   cldemoteintrin.h
diff --git a/clang/lib/Headers/__clang_cuda_math_forward_declares.h b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
index 7c0b3575b25ad..3d6d0b9115a17 100644
--- a/clang/lib/Headers/__clang_cuda_math_forward_declares.h
+++ b/clang/lib/Headers/__clang_cuda_math_forward_declares.h
@@ -8,8 +8,8 @@
  */
 #ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
 #define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
-#if !__CUDA__ && !__HIP__
-#error "This file is for CUDA/HIP compilation only."
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
 #endif
 
 // This file forward-declares of some math functions we (or the CUDA headers)
diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h
deleted file mode 100644
index e1cd49a39c65d..0000000000000
--- a/clang/lib/Headers/__clang_hip_libdevice_declares.h
+++ /dev/null
@@ -1,326 +0,0 @@
-/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
-#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
-
-extern "C" {
-
-// BEGIN FLOAT
-__device__ __attribute__((const)) float __ocml_acos_f32(float);
-__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
-__device__ __attribute__((const)) float __ocml_asin_f32(float);
-__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
-__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
-__device__ __attribute__((const)) float __ocml_atan_f32(float);
-__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
-__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
-__device__ __attribute__((const)) float __ocml_ceil_f32(float);
-__device__ __attribute__((const)) __device__ float __ocml_copysign_f32(float,
-                                                                       float);
-__device__ float __ocml_cos_f32(float);
-__device__ float __ocml_native_cos_f32(float);
-__device__ __attribute__((pure)) __device__ float __ocml_cosh_f32(float);
-__device__ float __ocml_cospi_f32(float);
-__device__ float __ocml_i0_f32(float);
-__device__ float __ocml_i1_f32(float);
-__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
-__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
-__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
-__device__ __attribute__((pure)) float __ocml_erf_f32(float);
-__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
-__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
-__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
-__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
-__device__ __attribute__((pure)) float __ocml_exp_f32(float);
-__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
-__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
-__device__ __attribute__((const)) float __ocml_fabs_f32(float);
-__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
-__device__ __attribute__((const)) float __ocml_floor_f32(float);
-__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
-__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
-__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
-__device__ __attribute__((const)) __device__ float __ocml_fmod_f32(float,
-                                                                   float);
-__device__ float __ocml_frexp_f32(float,
-                                  __attribute__((address_space(5))) int *);
-__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
-__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
-__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
-__device__ __attribute__((const)) int __ocml_isinf_f32(float);
-__device__ __attribute__((const)) int __ocml_isnan_f32(float);
-__device__ float __ocml_j0_f32(float);
-__device__ float __ocml_j1_f32(float);
-__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
-__device__ float __ocml_lgamma_f32(float);
-__device__ __attribute__((pure)) float __ocml_log10_f32(float);
-__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
-__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
-__device__ __attribute__((pure)) float __ocml_log2_f32(float);
-__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
-__device__ __attribute__((const)) float __ocml_logb_f32(float);
-__device__ __attribute__((pure)) float __ocml_log_f32(float);
-__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
-__device__ float __ocml_modf_f32(float,
-                                 __attribute__((address_space(5))) float *);
-__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
-__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
-__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
-__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
-                                                        float);
-__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
-__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
-__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
-__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
-__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
-__device__ float __ocml_remquo_f32(float, float,
-                                   __attribute__((address_space(5))) int *);
-__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
-__device__ __attribute__((const)) float __ocml_rint_f32(float);
-__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
-__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
-                                                         float);
-__device__ __attribute__((const)) float __ocml_round_f32(float);
-__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
-__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
-__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
-__device__ __attribute__((const)) int __ocml_signbit_f32(float);
-__device__ float __ocml_sincos_f32(float,
-                                   __attribute__((address_space(5))) float *);
-__device__ float __ocml_sincospi_f32(float,
-                                     __attribute__((address_space(5))) float *);
-__device__ float __ocml_sin_f32(float);
-__device__ float __ocml_native_sin_f32(float);
-__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
-__device__ float __ocml_sinpi_f32(float);
-__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
-__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
-__device__ float __ocml_tan_f32(float);
-__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
-__device__ float __ocml_tgamma_f32(float);
-__device__ __attribute__((const)) float __ocml_trunc_f32(float);
-__device__ float __ocml_y0_f32(float);
-__device__ float __ocml_y1_f32(float);
-
-// BEGIN INTRINSICS
-__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
-__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
-__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
-__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
-__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
-__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
-__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
-__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
-__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
-__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
-__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
-__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float);
-__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
-__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
-__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
-__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
-
-__device__ __attribute__((const)) float
-__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
-__device__ __attribute__((const)) float
-__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
-__device__ __attribute__((const)) float
-__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
-__device__ __attribute__((const)) float
-__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
-// END INTRINSICS
-// END FLOAT
-
-// BEGIN DOUBLE
-__device__ __attribute__((const)) double __ocml_acos_f64(double);
-__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
-__device__ __attribute__((const)) double __ocml_asin_f64(double);
-__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
-__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
-__device__ __attribute__((const)) double __ocml_atan_f64(double);
-__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
-__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
-__device__ __attribute__((const)) double __ocml_ceil_f64(double);
-__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
-__device__ double __ocml_cos_f64(double);
-__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
-__device__ double __ocml_cospi_f64(double);
-__device__ double __ocml_i0_f64(double);
-__device__ double __ocml_i1_f64(double);
-__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
-__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
-__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
-__device__ __attribute__((pure)) double __ocml_erf_f64(double);
-__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
-__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
-__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
-__device__ __attribute__((pure)) double __ocml_exp_f64(double);
-__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
-__device__ __attribute__((const)) double __ocml_fabs_f64(double);
-__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
-__device__ __attribute__((const)) double __ocml_floor_f64(double);
-__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
-__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
-__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
-__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
-__device__ double __ocml_frexp_f64(double,
-                                   __attribute__((address_space(5))) int *);
-__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
-__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
-__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
-__device__ __attribute__((const)) int __ocml_isinf_f64(double);
-__device__ __attribute__((const)) int __ocml_isnan_f64(double);
-__device__ double __ocml_j0_f64(double);
-__device__ double __ocml_j1_f64(double);
-__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
-__device__ double __ocml_lgamma_f64(double);
-__device__ __attribute__((pure)) double __ocml_log10_f64(double);
-__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
-__device__ __attribute__((pure)) double __ocml_log2_f64(double);
-__device__ __attribute__((const)) double __ocml_logb_f64(double);
-__device__ __attribute__((pure)) double __ocml_log_f64(double);
-__device__ double __ocml_modf_f64(double,
-                                  __attribute__((address_space(5))) double *);
-__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
-__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
-__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
-                                                         double);
-__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
-                                                         double);
-__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
-__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
-__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
-__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
-__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
-__device__ double __ocml_remquo_f64(double, double,
-                                    __attribute__((address_space(5))) int *);
-__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
-__device__ __attribute__((const)) double __ocml_rint_f64(double);
-__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
-                                                          double);
-__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
-                                                          double, double);
-__device__ __attribute__((const)) double __ocml_round_f64(double);
-__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
-__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
-__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
-__device__ __attribute__((const)) int __ocml_signbit_f64(double);
-__device__ double __ocml_sincos_f64(double,
-                                    __attribute__((address_space(5))) double *);
-__device__ double
-__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
-__device__ double __ocml_sin_f64(double);
-__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
-__device__ double __ocml_sinpi_f64(double);
-__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
-__device__ double __ocml_tan_f64(double);
-__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
-__device__ double __ocml_tgamma_f64(double);
-__device__ __attribute__((const)) double __ocml_trunc_f64(double);
-__device__ double __ocml_y0_f64(double);
-__device__ double __ocml_y1_f64(double);
-
-// BEGIN INTRINSICS
-__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
-__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
-__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
-__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
-__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
-__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
-__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
-__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
-__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
-__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
-__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
-__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double);
-__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
-                                                            double);
-__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
-                                                            double);
-__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
-                                                            double);
-__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
-                                                            double);
-
-__device__ __attribute__((const)) double
-__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
-__device__ __attribute__((const)) double
-__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
-
-__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
-__device__ _Float16 __ocml_cos_f16(_Float16);
-__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
-__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
-__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
-                                                          _Float16);
-__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
-__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
-__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
-__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
-__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
-__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
-__device__ _Float16 __ocml_sin_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
-__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
-
-typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
-typedef short __2i16 __attribute__((ext_vector_type(2)));
-
-__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
-                                                     float c, bool s);
-__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
-__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
-__device__ __2f16 __ocml_cos_2f16(__2f16);
-__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
-__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
-__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
-__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
-__device__ __attribute__((const))
-__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
-__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
-__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
-__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
-__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
-__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
-__device__ inline __2f16
-__llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL.
-{
-  return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)};
-}
-__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
-__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
-__device__ __2f16 __ocml_sin_2f16(__2f16);
-__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
-__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
-
-} // extern "C"
-
-#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
deleted file mode 100644
index fcc9a3bdbe17f..0000000000000
--- a/clang/lib/Headers/__clang_hip_math.h
+++ /dev/null
@@ -1,1185 +0,0 @@
-/*===---- __clang_hip_math.h - HIP math decls -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __CLANG_HIP_MATH_H__
-#define __CLANG_HIP_MATH_H__
-
-#include <algorithm>
-#include <limits.h>
-#include <limits>
-#include <stdint.h>
-
-#pragma push_macro("__DEVICE__")
-#pragma push_macro("__RETURN_TYPE")
-
-// to be consistent with __clang_cuda_math_forward_declares
-#define __DEVICE__ static __device__
-#define __RETURN_TYPE bool
-
-__DEVICE__
-inline uint64_t __make_mantissa_base8(const char *__tagp) {
-  uint64_t __r = 0;
-  while (__tagp) {
-    char __tmp = *__tagp;
-
-    if (__tmp >= '0' && __tmp <= '7')
-      __r = (__r * 8u) + __tmp - '0';
-    else
-      return 0;
-
-    ++__tagp;
-  }
-
-  return __r;
-}
-
-__DEVICE__
-inline uint64_t __make_mantissa_base10(const char *__tagp) {
-  uint64_t __r = 0;
-  while (__tagp) {
-    char __tmp = *__tagp;
-
-    if (__tmp >= '0' && __tmp <= '9')
-      __r = (__r * 10u) + __tmp - '0';
-    else
-      return 0;
-
-    ++__tagp;
-  }
-
-  return __r;
-}
-
-__DEVICE__
-inline uint64_t __make_mantissa_base16(const char *__tagp) {
-  uint64_t __r = 0;
-  while (__tagp) {
-    char __tmp = *__tagp;
-
-    if (__tmp >= '0' && __tmp <= '9')
-      __r = (__r * 16u) + __tmp - '0';
-    else if (__tmp >= 'a' && __tmp <= 'f')
-      __r = (__r * 16u) + __tmp - 'a' + 10;
-    else if (__tmp >= 'A' && __tmp <= 'F')
-      __r = (__r * 16u) + __tmp - 'A' + 10;
-    else
-      return 0;
-
-    ++__tagp;
-  }
-
-  return __r;
-}
-
-__DEVICE__
-inline uint64_t __make_mantissa(const char *__tagp) {
-  if (!__tagp)
-    return 0u;
-
-  if (*__tagp == '0') {
-    ++__tagp;
-
-    if (*__tagp == 'x' || *__tagp == 'X')
-      return __make_mantissa_base16(__tagp);
-    else
-      return __make_mantissa_base8(__tagp);
-  }
-
-  return __make_mantissa_base10(__tagp);
-}
-
-// BEGIN FLOAT
-__DEVICE__
-inline float abs(float __x) { return __ocml_fabs_f32(__x); }
-__DEVICE__
-inline float acosf(float __x) { return __ocml_acos_f32(__x); }
-__DEVICE__
-inline float acoshf(float __x) { return __ocml_acosh_f32(__x); }
-__DEVICE__
-inline float asinf(float __x) { return __ocml_asin_f32(__x); }
-__DEVICE__
-inline float asinhf(float __x) { return __ocml_asinh_f32(__x); }
-__DEVICE__
-inline float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
-__DEVICE__
-inline float atanf(float __x) { return __ocml_atan_f32(__x); }
-__DEVICE__
-inline float atanhf(float __x) { return __ocml_atanh_f32(__x); }
-__DEVICE__
-inline float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
-__DEVICE__
-inline float ceilf(float __x) { return __ocml_ceil_f32(__x); }
-__DEVICE__
-inline float copysignf(float __x, float __y) {
-  return __ocml_copysign_f32(__x, __y);
-}
-__DEVICE__
-inline float cosf(float __x) { return __ocml_cos_f32(__x); }
-__DEVICE__
-inline float coshf(float __x) { return __ocml_cosh_f32(__x); }
-__DEVICE__
-inline float cospif(float __x) { return __ocml_cospi_f32(__x); }
-__DEVICE__
-inline float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
-__DEVICE__
-inline float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
-__DEVICE__
-inline float erfcf(float __x) { return __ocml_erfc_f32(__x); }
-__DEVICE__
-inline float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
-__DEVICE__
-inline float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
-__DEVICE__
-inline float erff(float __x) { return __ocml_erf_f32(__x); }
-__DEVICE__
-inline float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
-__DEVICE__
-inline float exp10f(float __x) { return __ocml_exp10_f32(__x); }
-__DEVICE__
-inline float exp2f(float __x) { return __ocml_exp2_f32(__x); }
-__DEVICE__
-inline float expf(float __x) { return __ocml_exp_f32(__x); }
-__DEVICE__
-inline float expm1f(float __x) { return __ocml_expm1_f32(__x); }
-__DEVICE__
-inline float fabsf(float __x) { return __ocml_fabs_f32(__x); }
-__DEVICE__
-inline float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
-__DEVICE__
-inline float fdividef(float __x, float __y) { return __x / __y; }
-__DEVICE__
-inline float floorf(float __x) { return __ocml_floor_f32(__x); }
-__DEVICE__
-inline float fmaf(float __x, float __y, float __z) {
-  return __ocml_fma_f32(__x, __y, __z);
-}
-__DEVICE__
-inline float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
-__DEVICE__
-inline float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
-__DEVICE__
-inline float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
-__DEVICE__
-inline float frexpf(float __x, int *__nptr) {
-  int __tmp;
-  float __r =
-      __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
-  *__nptr = __tmp;
-
-  return __r;
-}
-__DEVICE__
-inline float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
-__DEVICE__
-inline int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
-__DEVICE__
-inline __RETURN_TYPE isfinite(float __x) { return __ocml_isfinite_f32(__x); }
-__DEVICE__
-inline __RETURN_TYPE isinf(float __x) { return __ocml_isinf_f32(__x); }
-__DEVICE__
-inline __RETURN_TYPE isnan(float __x) { return __ocml_isnan_f32(__x); }
-__DEVICE__
-inline float j0f(float __x) { return __ocml_j0_f32(__x); }
-__DEVICE__
-inline float j1f(float __x) { return __ocml_j1_f32(__x); }
-__DEVICE__
-inline float jnf(int __n,
-                 float __x) { // TODO: we could use Ahmes multiplication
-                              // and the Miller & Brown algorithm
-  //       for linear recurrences to get O(log n) steps, but it's unclear if
-  //       it'd be beneficial in this case.
-  if (__n == 0)
-    return j0f(__x);
-  if (__n == 1)
-    return j1f(__x);
-
-  float __x0 = j0f(__x);
-  float __x1 = j1f(__x);
-  for (int __i = 1; __i < __n; ++__i) {
-    float __x2 = (2 * __i) / __x * __x1 - __x0;
-    __x0 = __x1;
-    __x1 = __x2;
-  }
-
-  return __x1;
-}
-__DEVICE__
-inline float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
-__DEVICE__
-inline float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
-__DEVICE__
-inline long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
-__DEVICE__
-inline long long int llroundf(float __x) { return __ocml_round_f32(__x); }
-__DEVICE__
-inline float log10f(float __x) { return __ocml_log10_f32(__x); }
-__DEVICE__
-inline float log1pf(float __x) { return __ocml_log1p_f32(__x); }
-__DEVICE__
-inline float log2f(float __x) { return __ocml_log2_f32(__x); }
-__DEVICE__
-inline float logbf(float __x) { return __ocml_logb_f32(__x); }
-__DEVICE__
-inline float logf(float __x) { return __ocml_log_f32(__x); }
-__DEVICE__
-inline long int lrintf(float __x) { return __ocml_rint_f32(__x); }
-__DEVICE__
-inline long int lroundf(float __x) { return __ocml_round_f32(__x); }
-__DEVICE__
-inline float modff(float __x, float *__iptr) {
-  float __tmp;
-  float __r =
-      __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
-  *__iptr = __tmp;
-
-  return __r;
-}
-__DEVICE__
-inline float nanf(const char *__tagp) {
-  union {
-    float val;
-    struct ieee_float {
-      uint32_t mantissa : 22;
-      uint32_t quiet : 1;
-      uint32_t exponent : 8;
-      uint32_t sign : 1;
-    } bits;
-
-    static_assert(sizeof(float) == sizeof(ieee_float), "");
-  } __tmp;
-
-  __tmp.bits.sign = 0u;
-  __tmp.bits.exponent = ~0u;
-  __tmp.bits.quiet = 1u;
-  __tmp.bits.mantissa = __make_mantissa(__tagp);
-
-  return __tmp.val;
-}
-__DEVICE__
-inline float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
-__DEVICE__
-inline float nextafterf(float __x, float __y) {
-  return __ocml_nextafter_f32(__x, __y);
-}
-__DEVICE__
-inline float norm3df(float __x, float __y, float __z) {
-  return __ocml_len3_f32(__x, __y, __z);
-}
-__DEVICE__
-inline float norm4df(float __x, float __y, float __z, float __w) {
-  return __ocml_len4_f32(__x, __y, __z, __w);
-}
-__DEVICE__
-inline float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
-__DEVICE__
-inline float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
-__DEVICE__
-inline float
-normf(int __dim,
-      const float *__a) { // TODO: placeholder until OCML adds support.
-  float __r = 0;
-  while (__dim--) {
-    __r += __a[0] * __a[0];
-    ++__a;
-  }
-
-  return __ocml_sqrt_f32(__r);
-}
-__DEVICE__
-inline float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
-__DEVICE__
-inline float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
-__DEVICE__
-inline float remainderf(float __x, float __y) {
-  return __ocml_remainder_f32(__x, __y);
-}
-__DEVICE__
-inline float remquof(float __x, float __y, int *__quo) {
-  int __tmp;
-  float __r = __ocml_remquo_f32(
-      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
-  *__quo = __tmp;
-
-  return __r;
-}
-__DEVICE__
-inline float rhypotf(float __x, float __y) {
-  return __ocml_rhypot_f32(__x, __y);
-}
-__DEVICE__
-inline float rintf(float __x) { return __ocml_rint_f32(__x); }
-__DEVICE__
-inline float rnorm3df(float __x, float __y, float __z) {
-  return __ocml_rlen3_f32(__x, __y, __z);
-}
-
-__DEVICE__
-inline float rnorm4df(float __x, float __y, float __z, float __w) {
-  return __ocml_rlen4_f32(__x, __y, __z, __w);
-}
-__DEVICE__
-inline float
-rnormf(int __dim,
-       const float *__a) { // TODO: placeholder until OCML adds support.
-  float __r = 0;
-  while (__dim--) {
-    __r += __a[0] * __a[0];
-    ++__a;
-  }
-
-  return __ocml_rsqrt_f32(__r);
-}
-__DEVICE__
-inline float roundf(float __x) { return __ocml_round_f32(__x); }
-__DEVICE__
-inline float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
-__DEVICE__
-inline float scalblnf(float __x, long int __n) {
-  return (__n < INT_MAX) ? __ocml_scalbn_f32(__x, __n)
-                         : __ocml_scalb_f32(__x, __n);
-}
-__DEVICE__
-inline float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
-__DEVICE__
-inline __RETURN_TYPE signbit(float __x) { return __ocml_signbit_f32(__x); }
-__DEVICE__
-inline void sincosf(float __x, float *__sptr, float *__cptr) {
-  float __tmp;
-
-  *__sptr =
-      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
-  *__cptr = __tmp;
-}
-__DEVICE__
-inline void sincospif(float __x, float *__sptr, float *__cptr) {
-  float __tmp;
-
-  *__sptr = __ocml_sincospi_f32(
-      __x, (__attribute__((address_space(5))) float *)&__tmp);
-  *__cptr = __tmp;
-}
-__DEVICE__
-inline float sinf(float __x) { return __ocml_sin_f32(__x); }
-__DEVICE__
-inline float sinhf(float __x) { return __ocml_sinh_f32(__x); }
-__DEVICE__
-inline float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
-__DEVICE__
-inline float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
-__DEVICE__
-inline float tanf(float __x) { return __ocml_tan_f32(__x); }
-__DEVICE__
-inline float tanhf(float __x) { return __ocml_tanh_f32(__x); }
-__DEVICE__
-inline float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
-__DEVICE__
-inline float truncf(float __x) { return __ocml_trunc_f32(__x); }
-__DEVICE__
-inline float y0f(float __x) { return __ocml_y0_f32(__x); }
-__DEVICE__
-inline float y1f(float __x) { return __ocml_y1_f32(__x); }
-__DEVICE__
-inline float ynf(int __n,
-                 float __x) { // TODO: we could use Ahmes multiplication
-                              // and the Miller & Brown algorithm
-  //       for linear recurrences to get O(log n) steps, but it's unclear if
-  //       it'd be beneficial in this case. Placeholder until OCML adds
-  //       support.
-  if (__n == 0)
-    return y0f(__x);
-  if (__n == 1)
-    return y1f(__x);
-
-  float __x0 = y0f(__x);
-  float __x1 = y1f(__x);
-  for (int __i = 1; __i < __n; ++__i) {
-    float __x2 = (2 * __i) / __x * __x1 - __x0;
-    __x0 = __x1;
-    __x1 = __x2;
-  }
-
-  return __x1;
-}
-
-// BEGIN INTRINSICS
-__DEVICE__
-inline float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
-__DEVICE__
-inline float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
-__DEVICE__
-inline float __expf(float __x) { return __ocml_native_exp_f32(__x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fadd_rd(float __x, float __y) {
-  return __ocml_add_rtn_f32(__x, __y);
-}
-#endif
-__DEVICE__
-inline float __fadd_rn(float __x, float __y) { return __x + __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fadd_ru(float __x, float __y) {
-  return __ocml_add_rtp_f32(__x, __y);
-}
-__DEVICE__
-inline float __fadd_rz(float __x, float __y) {
-  return __ocml_add_rtz_f32(__x, __y);
-}
-__DEVICE__
-inline float __fdiv_rd(float __x, float __y) {
-  return __ocml_div_rtn_f32(__x, __y);
-}
-#endif
-__DEVICE__
-inline float __fdiv_rn(float __x, float __y) { return __x / __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fdiv_ru(float __x, float __y) {
-  return __ocml_div_rtp_f32(__x, __y);
-}
-__DEVICE__
-inline float __fdiv_rz(float __x, float __y) {
-  return __ocml_div_rtz_f32(__x, __y);
-}
-#endif
-__DEVICE__
-inline float __fdividef(float __x, float __y) { return __x / __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fmaf_rd(float __x, float __y, float __z) {
-  return __ocml_fma_rtn_f32(__x, __y, __z);
-}
-#endif
-__DEVICE__
-inline float __fmaf_rn(float __x, float __y, float __z) {
-  return __ocml_fma_f32(__x, __y, __z);
-}
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fmaf_ru(float __x, float __y, float __z) {
-  return __ocml_fma_rtp_f32(__x, __y, __z);
-}
-__DEVICE__
-inline float __fmaf_rz(float __x, float __y, float __z) {
-  return __ocml_fma_rtz_f32(__x, __y, __z);
-}
-__DEVICE__
-inline float __fmul_rd(float __x, float __y) {
-  return __ocml_mul_rtn_f32(__x, __y);
-}
-#endif
-__DEVICE__
-inline float __fmul_rn(float __x, float __y) { return __x * __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fmul_ru(float __x, float __y) {
-  return __ocml_mul_rtp_f32(__x, __y);
-}
-__DEVICE__
-inline float __fmul_rz(float __x, float __y) {
-  return __ocml_mul_rtz_f32(__x, __y);
-}
-__DEVICE__
-inline float __frcp_rd(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
-#endif
-__DEVICE__
-inline float __frcp_rn(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __frcp_ru(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
-__DEVICE__
-inline float __frcp_rz(float __x) { return __llvm_amdgcn_rcp_f32(__x); }
-#endif
-__DEVICE__
-inline float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
-#endif
-__DEVICE__
-inline float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
-__DEVICE__
-inline float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
-__DEVICE__
-inline float __fsub_rd(float __x, float __y) {
-  return __ocml_sub_rtn_f32(__x, __y);
-}
-#endif
-__DEVICE__
-inline float __fsub_rn(float __x, float __y) { return __x - __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline float __fsub_ru(float __x, float __y) {
-  return __ocml_sub_rtp_f32(__x, __y);
-}
-__DEVICE__
-inline float __fsub_rz(float __x, float __y) {
-  return __ocml_sub_rtz_f32(__x, __y);
-}
-#endif
-__DEVICE__
-inline float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
-__DEVICE__
-inline float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
-__DEVICE__
-inline float __logf(float __x) { return __ocml_native_log_f32(__x); }
-__DEVICE__
-inline float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
-__DEVICE__
-inline float __saturatef(float __x) {
-  return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x);
-}
-__DEVICE__
-inline void __sincosf(float __x, float *__sptr, float *__cptr) {
-  *__sptr = __ocml_native_sin_f32(__x);
-  *__cptr = __ocml_native_cos_f32(__x);
-}
-__DEVICE__
-inline float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
-__DEVICE__
-inline float __tanf(float __x) { return __ocml_tan_f32(__x); }
-// END INTRINSICS
-// END FLOAT
-
-// BEGIN DOUBLE
-__DEVICE__
-inline double abs(double __x) { return __ocml_fabs_f64(__x); }
-__DEVICE__
-inline double acos(double __x) { return __ocml_acos_f64(__x); }
-__DEVICE__
-inline double acosh(double __x) { return __ocml_acosh_f64(__x); }
-__DEVICE__
-inline double asin(double __x) { return __ocml_asin_f64(__x); }
-__DEVICE__
-inline double asinh(double __x) { return __ocml_asinh_f64(__x); }
-__DEVICE__
-inline double atan(double __x) { return __ocml_atan_f64(__x); }
-__DEVICE__
-inline double atan2(double __x, double __y) {
-  return __ocml_atan2_f64(__x, __y);
-}
-__DEVICE__
-inline double atanh(double __x) { return __ocml_atanh_f64(__x); }
-__DEVICE__
-inline double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
-__DEVICE__
-inline double ceil(double __x) { return __ocml_ceil_f64(__x); }
-__DEVICE__
-inline double copysign(double __x, double __y) {
-  return __ocml_copysign_f64(__x, __y);
-}
-__DEVICE__
-inline double cos(double __x) { return __ocml_cos_f64(__x); }
-__DEVICE__
-inline double cosh(double __x) { return __ocml_cosh_f64(__x); }
-__DEVICE__
-inline double cospi(double __x) { return __ocml_cospi_f64(__x); }
-__DEVICE__
-inline double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
-__DEVICE__
-inline double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
-__DEVICE__
-inline double erf(double __x) { return __ocml_erf_f64(__x); }
-__DEVICE__
-inline double erfc(double __x) { return __ocml_erfc_f64(__x); }
-__DEVICE__
-inline double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
-__DEVICE__
-inline double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
-__DEVICE__
-inline double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
-__DEVICE__
-inline double exp(double __x) { return __ocml_exp_f64(__x); }
-__DEVICE__
-inline double exp10(double __x) { return __ocml_exp10_f64(__x); }
-__DEVICE__
-inline double exp2(double __x) { return __ocml_exp2_f64(__x); }
-__DEVICE__
-inline double expm1(double __x) { return __ocml_expm1_f64(__x); }
-__DEVICE__
-inline double fabs(double __x) { return __ocml_fabs_f64(__x); }
-__DEVICE__
-inline double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
-__DEVICE__
-inline double floor(double __x) { return __ocml_floor_f64(__x); }
-__DEVICE__
-inline double fma(double __x, double __y, double __z) {
-  return __ocml_fma_f64(__x, __y, __z);
-}
-__DEVICE__
-inline double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
-__DEVICE__
-inline double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
-__DEVICE__
-inline double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
-__DEVICE__
-inline double frexp(double __x, int *__nptr) {
-  int __tmp;
-  double __r =
-      __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
-  *__nptr = __tmp;
-
-  return __r;
-}
-__DEVICE__
-inline double hypot(double __x, double __y) {
-  return __ocml_hypot_f64(__x, __y);
-}
-__DEVICE__
-inline int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
-__DEVICE__
-inline __RETURN_TYPE isfinite(double __x) { return __ocml_isfinite_f64(__x); }
-__DEVICE__
-inline __RETURN_TYPE isinf(double __x) { return __ocml_isinf_f64(__x); }
-__DEVICE__
-inline __RETURN_TYPE isnan(double __x) { return __ocml_isnan_f64(__x); }
-__DEVICE__
-inline double j0(double __x) { return __ocml_j0_f64(__x); }
-__DEVICE__
-inline double j1(double __x) { return __ocml_j1_f64(__x); }
-__DEVICE__
-inline double jn(int __n,
-                 double __x) { // TODO: we could use Ahmes multiplication
-                               // and the Miller & Brown algorithm
-  //       for linear recurrences to get O(log n) steps, but it's unclear if
-  //       it'd be beneficial in this case. Placeholder until OCML adds
-  //       support.
-  if (__n == 0)
-    return j0f(__x);
-  if (__n == 1)
-    return j1f(__x);
-
-  double __x0 = j0f(__x);
-  double __x1 = j1f(__x);
-  for (int __i = 1; __i < __n; ++__i) {
-    double __x2 = (2 * __i) / __x * __x1 - __x0;
-    __x0 = __x1;
-    __x1 = __x2;
-  }
-
-  return __x1;
-}
-__DEVICE__
-inline double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
-__DEVICE__
-inline double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
-__DEVICE__
-inline long long int llrint(double __x) { return __ocml_rint_f64(__x); }
-__DEVICE__
-inline long long int llround(double __x) { return __ocml_round_f64(__x); }
-__DEVICE__
-inline double log(double __x) { return __ocml_log_f64(__x); }
-__DEVICE__
-inline double log10(double __x) { return __ocml_log10_f64(__x); }
-__DEVICE__
-inline double log1p(double __x) { return __ocml_log1p_f64(__x); }
-__DEVICE__
-inline double log2(double __x) { return __ocml_log2_f64(__x); }
-__DEVICE__
-inline double logb(double __x) { return __ocml_logb_f64(__x); }
-__DEVICE__
-inline long int lrint(double __x) { return __ocml_rint_f64(__x); }
-__DEVICE__
-inline long int lround(double __x) { return __ocml_round_f64(__x); }
-__DEVICE__
-inline double modf(double __x, double *__iptr) {
-  double __tmp;
-  double __r =
-      __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
-  *__iptr = __tmp;
-
-  return __r;
-}
-__DEVICE__
-inline double nan(const char *__tagp) {
-#if !_WIN32
-  union {
-    double val;
-    struct ieee_double {
-      uint64_t mantissa : 51;
-      uint32_t quiet : 1;
-      uint32_t exponent : 11;
-      uint32_t sign : 1;
-    } bits;
-    static_assert(sizeof(double) == sizeof(ieee_double), "");
-  } __tmp;
-
-  __tmp.bits.sign = 0u;
-  __tmp.bits.exponent = ~0u;
-  __tmp.bits.quiet = 1u;
-  __tmp.bits.mantissa = __make_mantissa(__tagp);
-
-  return __tmp.val;
-#else
-  static_assert(sizeof(uint64_t) == sizeof(double));
-  uint64_t val = __make_mantissa(__tagp);
-  val |= 0xFFF << 51;
-  return *reinterpret_cast<double *>(&val);
-#endif
-}
-__DEVICE__
-inline double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
-__DEVICE__
-inline double nextafter(double __x, double __y) {
-  return __ocml_nextafter_f64(__x, __y);
-}
-__DEVICE__
-inline double
-norm(int __dim,
-     const double *__a) { // TODO: placeholder until OCML adds support.
-  double __r = 0;
-  while (__dim--) {
-    __r += __a[0] * __a[0];
-    ++__a;
-  }
-
-  return __ocml_sqrt_f64(__r);
-}
-__DEVICE__
-inline double norm3d(double __x, double __y, double __z) {
-  return __ocml_len3_f64(__x, __y, __z);
-}
-__DEVICE__
-inline double norm4d(double __x, double __y, double __z, double __w) {
-  return __ocml_len4_f64(__x, __y, __z, __w);
-}
-__DEVICE__
-inline double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
-__DEVICE__
-inline double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
-__DEVICE__
-inline double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
-__DEVICE__
-inline double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
-__DEVICE__
-inline double remainder(double __x, double __y) {
-  return __ocml_remainder_f64(__x, __y);
-}
-__DEVICE__
-inline double remquo(double __x, double __y, int *__quo) {
-  int __tmp;
-  double __r = __ocml_remquo_f64(
-      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
-  *__quo = __tmp;
-
-  return __r;
-}
-__DEVICE__
-inline double rhypot(double __x, double __y) {
-  return __ocml_rhypot_f64(__x, __y);
-}
-__DEVICE__
-inline double rint(double __x) { return __ocml_rint_f64(__x); }
-__DEVICE__
-inline double
-rnorm(int __dim,
-      const double *__a) { // TODO: placeholder until OCML adds support.
-  double __r = 0;
-  while (__dim--) {
-    __r += __a[0] * __a[0];
-    ++__a;
-  }
-
-  return __ocml_rsqrt_f64(__r);
-}
-__DEVICE__
-inline double rnorm3d(double __x, double __y, double __z) {
-  return __ocml_rlen3_f64(__x, __y, __z);
-}
-__DEVICE__
-inline double rnorm4d(double __x, double __y, double __z, double __w) {
-  return __ocml_rlen4_f64(__x, __y, __z, __w);
-}
-__DEVICE__
-inline double round(double __x) { return __ocml_round_f64(__x); }
-__DEVICE__
-inline double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
-__DEVICE__
-inline double scalbln(double __x, long int __n) {
-  return (__n < INT_MAX) ? __ocml_scalbn_f64(__x, __n)
-                         : __ocml_scalb_f64(__x, __n);
-}
-__DEVICE__
-inline double scalbn(double __x, int __n) {
-  return __ocml_scalbn_f64(__x, __n);
-}
-__DEVICE__
-inline __RETURN_TYPE signbit(double __x) { return __ocml_signbit_f64(__x); }
-__DEVICE__
-inline double sin(double __x) { return __ocml_sin_f64(__x); }
-__DEVICE__
-inline void sincos(double __x, double *__sptr, double *__cptr) {
-  double __tmp;
-  *__sptr = __ocml_sincos_f64(
-      __x, (__attribute__((address_space(5))) double *)&__tmp);
-  *__cptr = __tmp;
-}
-__DEVICE__
-inline void sincospi(double __x, double *__sptr, double *__cptr) {
-  double __tmp;
-  *__sptr = __ocml_sincospi_f64(
-      __x, (__attribute__((address_space(5))) double *)&__tmp);
-  *__cptr = __tmp;
-}
-__DEVICE__
-inline double sinh(double __x) { return __ocml_sinh_f64(__x); }
-__DEVICE__
-inline double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
-__DEVICE__
-inline double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
-__DEVICE__
-inline double tan(double __x) { return __ocml_tan_f64(__x); }
-__DEVICE__
-inline double tanh(double __x) { return __ocml_tanh_f64(__x); }
-__DEVICE__
-inline double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
-__DEVICE__
-inline double trunc(double __x) { return __ocml_trunc_f64(__x); }
-__DEVICE__
-inline double y0(double __x) { return __ocml_y0_f64(__x); }
-__DEVICE__
-inline double y1(double __x) { return __ocml_y1_f64(__x); }
-__DEVICE__
-inline double yn(int __n,
-                 double __x) { // TODO: we could use Ahmes multiplication
-                               // and the Miller & Brown algorithm
-  //       for linear recurrences to get O(log n) steps, but it's unclear if
-  //       it'd be beneficial in this case. Placeholder until OCML adds
-  //       support.
-  if (__n == 0)
-    return j0f(__x);
-  if (__n == 1)
-    return j1f(__x);
-
-  double __x0 = j0f(__x);
-  double __x1 = j1f(__x);
-  for (int __i = 1; __i < __n; ++__i) {
-    double __x2 = (2 * __i) / __x * __x1 - __x0;
-    __x0 = __x1;
-    __x1 = __x2;
-  }
-
-  return __x1;
-}
-
-// BEGIN INTRINSICS
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __dadd_rd(double __x, double __y) {
-  return __ocml_add_rtn_f64(__x, __y);
-}
-#endif
-__DEVICE__
-inline double __dadd_rn(double __x, double __y) { return __x + __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __dadd_ru(double __x, double __y) {
-  return __ocml_add_rtp_f64(__x, __y);
-}
-__DEVICE__
-inline double __dadd_rz(double __x, double __y) {
-  return __ocml_add_rtz_f64(__x, __y);
-}
-__DEVICE__
-inline double __ddiv_rd(double __x, double __y) {
-  return __ocml_div_rtn_f64(__x, __y);
-}
-#endif
-__DEVICE__
-inline double __ddiv_rn(double __x, double __y) { return __x / __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __ddiv_ru(double __x, double __y) {
-  return __ocml_div_rtp_f64(__x, __y);
-}
-__DEVICE__
-inline double __ddiv_rz(double __x, double __y) {
-  return __ocml_div_rtz_f64(__x, __y);
-}
-__DEVICE__
-inline double __dmul_rd(double __x, double __y) {
-  return __ocml_mul_rtn_f64(__x, __y);
-}
-#endif
-__DEVICE__
-inline double __dmul_rn(double __x, double __y) { return __x * __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __dmul_ru(double __x, double __y) {
-  return __ocml_mul_rtp_f64(__x, __y);
-}
-__DEVICE__
-inline double __dmul_rz(double __x, double __y) {
-  return __ocml_mul_rtz_f64(__x, __y);
-}
-__DEVICE__
-inline double __drcp_rd(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
-#endif
-__DEVICE__
-inline double __drcp_rn(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __drcp_ru(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
-__DEVICE__
-inline double __drcp_rz(double __x) { return __llvm_amdgcn_rcp_f64(__x); }
-__DEVICE__
-inline double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); }
-#endif
-__DEVICE__
-inline double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); }
-__DEVICE__
-inline double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
-__DEVICE__
-inline double __dsub_rd(double __x, double __y) {
-  return __ocml_sub_rtn_f64(__x, __y);
-}
-#endif
-__DEVICE__
-inline double __dsub_rn(double __x, double __y) { return __x - __y; }
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __dsub_ru(double __x, double __y) {
-  return __ocml_sub_rtp_f64(__x, __y);
-}
-__DEVICE__
-inline double __dsub_rz(double __x, double __y) {
-  return __ocml_sub_rtz_f64(__x, __y);
-}
-__DEVICE__
-inline double __fma_rd(double __x, double __y, double __z) {
-  return __ocml_fma_rtn_f64(__x, __y, __z);
-}
-#endif
-__DEVICE__
-inline double __fma_rn(double __x, double __y, double __z) {
-  return __ocml_fma_f64(__x, __y, __z);
-}
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-inline double __fma_ru(double __x, double __y, double __z) {
-  return __ocml_fma_rtp_f64(__x, __y, __z);
-}
-__DEVICE__
-inline double __fma_rz(double __x, double __y, double __z) {
-  return __ocml_fma_rtz_f64(__x, __y, __z);
-}
-#endif
-// END INTRINSICS
-// END DOUBLE
-
-// BEGIN INTEGER
-__DEVICE__
-inline int abs(int __x) {
-  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
-}
-__DEVICE__
-inline long labs(long __x) {
-  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
-}
-__DEVICE__
-inline long long llabs(long long __x) {
-  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
-}
-
-#if defined(__cplusplus)
-__DEVICE__
-inline long abs(long __x) { return labs(__x); }
-__DEVICE__
-inline long long abs(long long __x) { return llabs(__x); }
-#endif
-// END INTEGER
-
-__DEVICE__
-inline _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
-  return __ocml_fma_f16(__x, __y, __z);
-}
-
-__DEVICE__
-inline float fma(float __x, float __y, float __z) {
-  return fmaf(__x, __y, __z);
-}
-
-#pragma push_macro("__DEF_FUN1")
-#pragma push_macro("__DEF_FUN2")
-#pragma push_macro("__DEF_FUNI")
-#pragma push_macro("__DEF_FLOAT_FUN2I")
-#pragma push_macro("__HIP_OVERLOAD1")
-#pragma push_macro("__HIP_OVERLOAD2")
-
-// __hip_enable_if::type is a type function which returns __T if __B is true.
-template <bool __B, class __T = void> struct __hip_enable_if {};
-
-template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
-
-// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
-// avoid compilation error due to ambibuity. e.g. floor(5) is resolved with
-// floor(double).
-#define __HIP_OVERLOAD1(__retty, __fn)                                         \
-  template <typename __T>                                                      \
-  __DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer,    \
-                                      __retty>::type                           \
-  __fn(__T __x) {                                                              \
-    return ::__fn((double)__x);                                                \
-  }
-
-// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
-// or integer argument to avoid compilation error due to ambibuity. e.g.
-// max(5.0f, 6.0) is resolved with max(double, double).
-#define __HIP_OVERLOAD2(__retty, __fn)                                         \
-  template <typename __T1, typename __T2>                                      \
-  __DEVICE__                                                                   \
-      typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&    \
-                                   std::numeric_limits<__T2>::is_specialized,  \
-                               __retty>::type                                  \
-      __fn(__T1 __x, __T2 __y) {                                               \
-    return __fn((double)__x, (double)__y);                                     \
-  }
-
-// Define cmath functions with float argument and returns float.
-#define __DEF_FUN1(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  inline float __func(float __x) { return __func##f(__x); }                    \
-  __HIP_OVERLOAD1(__retty, __func)
-
-// Define cmath functions with float argument and returns __retty.
-#define __DEF_FUNI(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  inline __retty __func(float __x) { return __func##f(__x); }                  \
-  __HIP_OVERLOAD1(__retty, __func)
-
-// define cmath functions with two float arguments.
-#define __DEF_FUN2(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  inline float __func(float __x, float __y) { return __func##f(__x, __y); }    \
-  __HIP_OVERLOAD2(__retty, __func)
-
-__DEF_FUN1(double, acos)
-__DEF_FUN1(double, acosh)
-__DEF_FUN1(double, asin)
-__DEF_FUN1(double, asinh)
-__DEF_FUN1(double, atan)
-__DEF_FUN2(double, atan2);
-__DEF_FUN1(double, atanh)
-__DEF_FUN1(double, cbrt)
-__DEF_FUN1(double, ceil)
-__DEF_FUN2(double, copysign);
-__DEF_FUN1(double, cos)
-__DEF_FUN1(double, cosh)
-__DEF_FUN1(double, erf)
-__DEF_FUN1(double, erfc)
-__DEF_FUN1(double, exp)
-__DEF_FUN1(double, exp2)
-__DEF_FUN1(double, expm1)
-__DEF_FUN1(double, fabs)
-__DEF_FUN2(double, fdim);
-__DEF_FUN1(double, floor)
-__DEF_FUN2(double, fmax);
-__DEF_FUN2(double, fmin);
-__DEF_FUN2(double, fmod);
-//__HIP_OVERLOAD1(int, fpclassify)
-__DEF_FUN2(double, hypot);
-__DEF_FUNI(int, ilogb)
-__HIP_OVERLOAD1(bool, isfinite)
-__HIP_OVERLOAD2(bool, isgreater);
-__HIP_OVERLOAD2(bool, isgreaterequal);
-__HIP_OVERLOAD1(bool, isinf);
-__HIP_OVERLOAD2(bool, isless);
-__HIP_OVERLOAD2(bool, islessequal);
-__HIP_OVERLOAD2(bool, islessgreater);
-__HIP_OVERLOAD1(bool, isnan);
-//__HIP_OVERLOAD1(bool, isnormal)
-__HIP_OVERLOAD2(bool, isunordered);
-__DEF_FUN1(double, lgamma)
-__DEF_FUN1(double, log)
-__DEF_FUN1(double, log10)
-__DEF_FUN1(double, log1p)
-__DEF_FUN1(double, log2)
-__DEF_FUN1(double, logb)
-__DEF_FUNI(long long, llrint)
-__DEF_FUNI(long long, llround)
-__DEF_FUNI(long, lrint)
-__DEF_FUNI(long, lround)
-__DEF_FUN1(double, nearbyint);
-__DEF_FUN2(double, nextafter);
-__DEF_FUN2(double, pow);
-__DEF_FUN2(double, remainder);
-__DEF_FUN1(double, rint);
-__DEF_FUN1(double, round);
-__HIP_OVERLOAD1(bool, signbit)
-__DEF_FUN1(double, sin)
-__DEF_FUN1(double, sinh)
-__DEF_FUN1(double, sqrt)
-__DEF_FUN1(double, tan)
-__DEF_FUN1(double, tanh)
-__DEF_FUN1(double, tgamma)
-__DEF_FUN1(double, trunc);
-
-// define cmath functions with a float and an integer argument.
-#define __DEF_FLOAT_FUN2I(__func)                                              \
-  __DEVICE__                                                                   \
-  inline float __func(float __x, int __y) { return __func##f(__x, __y); }
-__DEF_FLOAT_FUN2I(scalbn)
-
-template <class T> __DEVICE__ inline T min(T __arg1, T __arg2) {
-  return (__arg1 < __arg2) ? __arg1 : __arg2;
-}
-
-template <class T> __DEVICE__ inline T max(T __arg1, T __arg2) {
-  return (__arg1 > __arg2) ? __arg1 : __arg2;
-}
-
-__DEVICE__ inline int min(int __arg1, int __arg2) {
-  return (__arg1 < __arg2) ? __arg1 : __arg2;
-}
-__DEVICE__ inline int max(int __arg1, int __arg2) {
-  return (__arg1 > __arg2) ? __arg1 : __arg2;
-}
-
-__DEVICE__
-inline float max(float __x, float __y) { return fmaxf(__x, __y); }
-
-__DEVICE__
-inline double max(double __x, double __y) { return fmax(__x, __y); }
-
-__DEVICE__
-inline float min(float __x, float __y) { return fminf(__x, __y); }
-
-__DEVICE__
-inline double min(double __x, double __y) { return fmin(__x, __y); }
-
-__HIP_OVERLOAD2(double, max)
-__HIP_OVERLOAD2(double, min)
-
-__host__ inline static int min(int __arg1, int __arg2) {
-  return std::min(__arg1, __arg2);
-}
-
-__host__ inline static int max(int __arg1, int __arg2) {
-  return std::max(__arg1, __arg2);
-}
-
-#pragma pop_macro("__DEF_FUN1")
-#pragma pop_macro("__DEF_FUN2")
-#pragma pop_macro("__DEF_FUNI")
-#pragma pop_macro("__DEF_FLOAT_FUN2I")
-#pragma pop_macro("__HIP_OVERLOAD1")
-#pragma pop_macro("__HIP_OVERLOAD2")
-#pragma pop_macro("__DEVICE__")
-#pragma pop_macro("__RETURN_TYPE")
-
-#endif // __CLANG_HIP_MATH_H__
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
deleted file mode 100644
index 8c86649fc9601..0000000000000
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-/*
- * WARNING: This header is intended to be directly -include'd by
- * the compiler and is not supposed to be included by users.
- *
- */
-
-#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
-#define __CLANG_HIP_RUNTIME_WRAPPER_H__
-
-#if __HIP__
-
-#include <cmath>
-#include <cstdlib>
-#include <stdlib.h>
-
-#define __host__ __attribute__((host))
-#define __device__ __attribute__((device))
-#define __global__ __attribute__((global))
-#define __shared__ __attribute__((shared))
-#define __constant__ __attribute__((constant))
-
-#if __HIP_ENABLE_DEVICE_MALLOC__
-extern "C" __device__ void *__hip_malloc(size_t __size);
-extern "C" __device__ void *__hip_free(void *__ptr);
-static inline __device__ void *malloc(size_t __size) {
-  return __hip_malloc(__size);
-}
-static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
-#else
-static inline __device__ void *malloc(size_t __size) {
-  __builtin_trap();
-  return nullptr;
-}
-static inline __device__ void *free(void *__ptr) {
-  __builtin_trap();
-  return nullptr;
-}
-#endif
-
-#include <__clang_hip_libdevice_declares.h>
-#include <__clang_hip_math.h>
-
-#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
-#include <__clang_cuda_math_forward_declares.h>
-#include <__clang_cuda_complex_builtins.h>
-
-#include <algorithm>
-#include <complex>
-#include <new>
-#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
-
-#define __CLANG_HIP_RUNTIME_WRPPER_INCLUDED__ 1
-
-#endif // __HIP__
-#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/hip.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/hip.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/hip.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/hip.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ockl.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ockl.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ockl.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ockl.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_off.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_off.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_on.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_daz_opt_on.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_daz_opt_on.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_off.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_off.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_off.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_on.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_finite_only_on.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_finite_only_on.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1010.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1010.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1010.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1010.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1011.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1011.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1011.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1011.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1012.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1012.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_1012.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_1012.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_803.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_803.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_803.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_803.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_900.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_900.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_isa_version_900.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_isa_version_900.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_off.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_off.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_on.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_unsafe_math_on.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_unsafe_math_on.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_off.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/oclc_wavefrontsize64_on.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ocml.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ocml.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/ocml.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/ocml.bc
diff --git a/clang/test/Driver/Inputs/rocm/amdgcn/bitcode/opencl.bc b/clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/opencl.bc
similarity index 100%
rename from clang/test/Driver/Inputs/rocm/amdgcn/bitcode/opencl.bc
rename to clang/test/Driver/Inputs/rocm-device-libs/amdgcn/bitcode/opencl.bc
diff --git a/clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h b/clang/test/Driver/Inputs/rocm/include/hip/hip_runtime.h
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip
index 3dd798476e2ba..eaa3f83ae247d 100644
--- a/clang/test/Driver/hip-device-libs.hip
+++ b/clang/test/Driver/hip-device-libs.hip
@@ -8,7 +8,7 @@
 // Test subtarget with flushing on by default.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:  --cuda-gpu-arch=gfx803 \
-// RUN:  --rocm-path=%S/Inputs/rocm   \
+// RUN:  --rocm-path=%S/Inputs/rocm-device-libs   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -16,7 +16,7 @@
 // Test subtarget with flushing off by ddefault.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:  --cuda-gpu-arch=gfx900 \
-// RUN:  --rocm-path=%S/Inputs/rocm \
+// RUN:  --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -25,7 +25,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -34,7 +34,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -43,7 +43,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -52,7 +52,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -61,7 +61,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -69,7 +69,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fcuda-flush-denormals-to-zero -fno-cuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm   \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,NOFLUSHD
 
@@ -77,7 +77,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm   \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -85,7 +85,7 @@
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
 // RUN:   -fno-cuda-flush-denormals-to-zero -fcuda-flush-denormals-to-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
@@ -93,13 +93,13 @@
 // Test --hip-device-lib-path flag
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx803 \
-// RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode   \
+// RUN:   --hip-device-lib-path=%S/Inputs/rocm-device-libs/amdgcn/bitcode   \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,FLUSHD
 
 
 // Test environment variable HIP_DEVICE_LIB_PATH
-// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode \
+// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm-device-libs/amdgcn/bitcode \
 // RUN:   %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=gfx900 \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
diff --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip
deleted file mode 100644
index 8c0d317936b57..0000000000000
--- a/clang/test/Driver/hip-include-path.hip
+++ /dev/null
@@ -1,33 +0,0 @@
-// REQUIRES: clang-driver
-// REQUIRES: x86-registered-target
-// REQUIRES: amdgpu-registered-target
-// REQUIRES: libgcc
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
-// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpulib %s 2>&1 \
-// RUN:   | FileCheck -check-prefixes=COMMON,WRAP,HIP %s
-
-// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
-// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nobuiltininc -nogpulib %s 2>&1 \
-// RUN:   | FileCheck -check-prefixes=COMMON,NOWRAP,HIP %s
-
-// RUN: %clang -c -v -target x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \
-// RUN:   -std=c++11 --rocm-path=%S/Inputs/rocm -nogpuinc -nogpulib %s 2>&1 \
-// RUN:   | FileCheck -check-prefixes=COMMON,WRAP,NOHIP %s
-
-// COMMON: "{{[^"]*}}clang{{[^"]*}}"
-// WRAP: clang/{{.*}}/include/cuda_wrappers
-// NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
-// HIP: {{.*}}Inputs/rocm/include
-// NOHIP-NOT: {{.*}}Inputs/rocm/include
-// skip check of standard C++ include path
-// COMMON: clang/{{.*}}/include
-
-// COMMON: "{{[^"]*}}clang{{[^"]*}}"
-// WRAP: clang/{{.*}}/include/cuda_wrappers
-// NOWRAP-NOT: clang/{{.*}}/include/cuda_wrappers
-// HIP: {{.*}}Inputs/rocm/include
-// NOHIP-NOT: {{.*}}Inputs/rocm/include
-// skip check of standard C++ include path
-// COMMON: clang/{{.*}}/include
diff --git a/clang/test/Driver/rocm-detect.cl b/clang/test/Driver/rocm-detect.cl
index 75378bf003bee..b143098c9074c 100644
--- a/clang/test/Driver/rocm-detect.cl
+++ b/clang/test/Driver/rocm-detect.cl
@@ -7,12 +7,12 @@
 // target not included in the test.
 
 // RUN: %clang -### -v -target amdgcn-amd-amdhsa -mcpu=gfx902 \
-// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 
 // RUN: %clang -### -v -target amdgcn-amd-amdhsa -mcpu=gfx902 -nogpulib \
-// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902,NODEFAULTLIBS %s
 
 
diff --git a/clang/test/Driver/rocm-detect.hip b/clang/test/Driver/rocm-detect.hip
index 9490ec9ba3762..82ed7138098ad 100644
--- a/clang/test/Driver/rocm-detect.hip
+++ b/clang/test/Driver/rocm-detect.hip
@@ -8,17 +8,17 @@
 // target not included in the test.
 
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 \
-// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 // Should not interpret -nostdlib as disabling offload libraries.
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 -nostdlib \
-// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902-DEFAULTLIBS %s
 
 
 // RUN: %clang -### -v -target x86_64-linux-gnu --cuda-gpu-arch=gfx902 -nogpulib \
-// RUN:   --rocm-path=%S/Inputs/rocm %s 2>&1 \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs %s 2>&1 \
 // RUN:   | FileCheck -check-prefixes=COMMON,GFX902,NODEFAULTLIBS %s
 
 
diff --git a/clang/test/Driver/rocm-device-libs.cl b/clang/test/Driver/rocm-device-libs.cl
index cdb4716bde9a8..23cabd654391d 100644
--- a/clang/test/Driver/rocm-device-libs.cl
+++ b/clang/test/Driver/rocm-device-libs.cl
@@ -6,7 +6,7 @@
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
@@ -15,7 +15,7 @@
 // Make sure the different denormal default is respected for gfx8
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
@@ -24,7 +24,7 @@
 // Make sure the non-canonical name works
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=fiji \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX803-DEFAULT,GFX803,WAVE64 %s
 
@@ -33,7 +33,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   -cl-denorms-are-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DAZ,GFX900,WAVE64 %s
 
@@ -41,7 +41,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   -cl-denorms-are-zero \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DAZ,GFX803,WAVE64 %s
 
@@ -50,7 +50,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx803 \
 // RUN:   -cl-finite-math-only \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-FINITE-ONLY,GFX803,WAVE64 %s
 
@@ -59,7 +59,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-fp32-correctly-rounded-divide-sqrt \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-CORRECT-SQRT,GFX803,WAVE64 %s
 
@@ -68,7 +68,7 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-fast-relaxed-math \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-FAST-RELAXED,GFX803,WAVE64 %s
 
@@ -77,45 +77,45 @@
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx803                     \
 // RUN:   -cl-unsafe-math-optimizations \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-UNSAFE,GFX803,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010                    \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE32 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1011                    \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1011,WAVE32 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1012                    \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1012,WAVE32 %s
 
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010 -mwavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE64 %s
 
 // RUN: %clang -### -target amdgcn-amd-amdhsa    \
 // RUN:   -x cl -mcpu=gfx1010 -mwavefrontsize64 -mno-wavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX1010,WAVE32 %s
 
 // Ignore -mno-wavefrontsize64 without wave32 support
 // RUN: %clang -### -target amdgcn-amd-amdhsa       \
 // RUN:   -x cl -mcpu=gfx803  -mno-wavefrontsize64  \
-// RUN:   --rocm-path=%S/Inputs/rocm    \
+// RUN:   --rocm-path=%S/Inputs/rocm-device-libs    \
 // RUN:   %s \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMMON,GFX803,WAVE64 %s
 
@@ -124,12 +124,12 @@
 // Test --hip-device-lib-path format
 // RUN: %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
-// RUN:   --hip-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode \
+// RUN:   --hip-device-lib-path=%S/Inputs/rocm-device-libs/amdgcn/bitcode \
 // RUN:   %S/opencl.cl \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
 
 // Test environment variable HIP_DEVICE_LIB_PATH
-// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
+// RUN: env HIP_DEVICE_LIB_PATH=%S/Inputs/rocm-device-libs/amdgcn/bitcode %clang -### -target amdgcn-amd-amdhsa \
 // RUN:   -x cl -mcpu=gfx900 \
 // RUN:   %S/opencl.cl \
 // RUN: 2>&1 | FileCheck -dump-input-on-failure --check-prefixes=COMMON,COMMON-DEFAULT,GFX900-DEFAULT,GFX900,WAVE64 %s
diff --git a/clang/test/Driver/rocm-not-found.cl b/clang/test/Driver/rocm-not-found.cl
index ee931971d9e6a..8ecc4b0ef1055 100644
--- a/clang/test/Driver/rocm-not-found.cl
+++ b/clang/test/Driver/rocm-not-found.cl
@@ -5,7 +5,7 @@
 
 // RUN: %clang -### --sysroot=%s/no-rocm-there -target amdgcn--amdhsa %s 2>&1 | FileCheck %s --check-prefix ERR
 // RUN: %clang -### --rocm-path=%s/no-rocm-there -target amdgcn--amdhsa %s 2>&1 | FileCheck %s --check-prefix ERR
-// ERR: cannot find ROCm installation. Provide its path via --rocm-path, or pass -nogpulib and -nogpuinc to build without ROCm device library and HIP includes.
+// ERR: cannot find ROCm installation. Provide its path via --rocm-path, or pass -nogpulib.
 
 // Accept nogpulib or nostdlib for OpenCL.
 // RUN: %clang -### -nogpulib --rocm-path=%s/no-rocm-there %s 2>&1 | FileCheck %s --check-prefix OK
diff --git a/clang/test/Preprocessor/hip-host-cpu-macros.cu b/clang/test/Preprocessor/hip-host-cpu-macros.cu
index 559541eecb0af..efec439c3656e 100644
--- a/clang/test/Preprocessor/hip-host-cpu-macros.cu
+++ b/clang/test/Preprocessor/hip-host-cpu-macros.cu
@@ -8,6 +8,6 @@ DEVICE __SSE3__
 HOST __SSE3__
 #endif
 
-// RUN: %clang -x hip -E -target x86_64-linux-gnu -msse3 --cuda-gpu-arch=gfx803 -nogpulib -nogpuinc -o - %s 2>&1 | FileCheck %s
+// RUN: %clang -x hip -E -target x86_64-linux-gnu -msse3 --cuda-gpu-arch=gfx803 -nogpulib -o - %s 2>&1 | FileCheck %s
 
 // CHECK-NOT: SSE3

From 1e9a0a4e04aa66f048ba791e25d32ce8c02a20a4 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 1 Jun 2020 14:35:12 -0700
Subject: [PATCH 10/24] SROA: Remove pointer from visited along with
 instruction

If an instruction is erased we also need to remove it from the
Visited set. There is a very small chance that another newly
created instruction will be allocated at the same pointer value
in place of the erased one.

Differential Revision: https://reviews.llvm.org/D80958
---
 llvm/lib/Transforms/Scalar/SROA.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 7e92f7b06ecbd..9b3c77eae120b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3381,6 +3381,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
                             getAdjustedAlignment(&LI, 0), DL);
     Value *V = UndefValue::get(LI.getType());
     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+    Visited.erase(&LI);
     LI.replaceAllUsesWith(V);
     LI.eraseFromParent();
     return true;
@@ -3427,6 +3428,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
                              getAdjustedAlignment(&SI, 0), DL);
     Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+    Visited.erase(&SI);
     SI.eraseFromParent();
     return true;
   }
@@ -3473,6 +3475,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
 
     Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
                                        Sel->getName() + ".sroa.sel");
+    Visited.erase(&GEPI);
     GEPI.replaceAllUsesWith(NSel);
     GEPI.eraseFromParent();
     Instruction *NSelI = cast<Instruction>(NSel);
@@ -3521,6 +3524,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
       NewPN->addIncoming(NewVal, PHI->getIncomingBlock(I));
     }
 
+    Visited.erase(&GEPI);
     GEPI.replaceAllUsesWith(NewPN);
     GEPI.eraseFromParent();
     Visited.insert(NewPN);

From 7e7ec2b32598083eec0c2a54e04e19f1b7c83594 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 5 Jun 2020 19:48:20 +0000
Subject: [PATCH 11/24] [gn build] Port 8a8c6913a93

---
 llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 9c86dbff22ad4..fc6b6dde04f97 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -73,9 +73,6 @@ copy("Headers") {
     "__clang_cuda_math.h",
     "__clang_cuda_math_forward_declares.h",
     "__clang_cuda_runtime_wrapper.h",
-    "__clang_hip_libdevice_declares.h",
-    "__clang_hip_math.h",
-    "__clang_hip_runtime_wrapper.h",
     "__stddef_max_align_t.h",
     "__wmmintrin_aes.h",
     "__wmmintrin_pclmul.h",

From 672ed5386024ba5cee53e19d637b7920a4889837 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Mon, 1 Jun 2020 17:11:06 -0700
Subject: [PATCH 12/24] [codeview] Put !heapallocsite on calls to operator new

Clang marks calls to operator new as heap allocation sites, but the
operator declared at global scope returns a void pointer. There is no
explicit cast in the code, so the compiler has to write down the
allocated type itself.

Also generalize a cast to use CallBase, so that we mark heap alloc sites
when exceptions are enabled.

Reviewed By: MaskRay

Differential Revision: https://reviews.llvm.org/D80966
---
 clang/lib/CodeGen/CGCall.cpp                  |  2 +-
 clang/lib/CodeGen/CGDebugInfo.cpp             | 12 +++----
 clang/lib/CodeGen/CGDebugInfo.h               |  2 +-
 clang/lib/CodeGen/CGExprCXX.cpp               |  7 ++++
 clang/lib/CodeGen/CGExprScalar.cpp            | 14 +++++---
 .../debug-info-codeview-heapallocsite.c       |  7 ++--
 .../debug-info-codeview-heapallocsite.cpp     | 34 +++++++++++++++++++
 7 files changed, 62 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 6bde3124555b4..136782fccf409 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4951,7 +4951,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // Add metadata for calls to MSAllocator functions
   if (getDebugInfo() && TargetDecl &&
       TargetDecl->hasAttr<MSAllocatorAttr>())
-    getDebugInfo()->addHeapAllocSiteMetadata(CI, RetTy, Loc);
+    getDebugInfo()->addHeapAllocSiteMetadata(CI, RetTy->getPointeeType(), Loc);
 
   // 4. Finish the call.
 
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index cc50ec6a8c897..1737154d179a6 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -2146,16 +2146,14 @@ llvm::DIType *CGDebugInfo::getOrCreateStandaloneType(QualType D,
   return T;
 }
 
-void CGDebugInfo::addHeapAllocSiteMetadata(llvm::Instruction *CI,
-                                           QualType D,
+void CGDebugInfo::addHeapAllocSiteMetadata(llvm::CallBase *CI,
+                                           QualType AllocatedTy,
                                            SourceLocation Loc) {
   llvm::MDNode *node;
-  if (D.getTypePtr()->isVoidPointerType()) {
+  if (AllocatedTy->isVoidType())
     node = llvm::MDNode::get(CGM.getLLVMContext(), None);
-  } else {
-    QualType PointeeTy = D.getTypePtr()->getPointeeType();
-    node = getOrCreateType(PointeeTy, getOrCreateFile(Loc));
-  }
+  else
+    node = getOrCreateType(AllocatedTy, getOrCreateFile(Loc));
 
   CI->setMetadata("heapallocsite", node);
 }
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index 367047e79dc9f..96ef6c7c1d27d 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -509,7 +509,7 @@ class CGDebugInfo {
   llvm::DIType *getOrCreateStandaloneType(QualType Ty, SourceLocation Loc);
 
   /// Add heapallocsite metadata for MSAllocator calls.
-  void addHeapAllocSiteMetadata(llvm::Instruction *CallSite, QualType Ty,
+  void addHeapAllocSiteMetadata(llvm::CallBase *CallSite, QualType AllocatedTy,
                                 SourceLocation Loc);
 
   void completeType(const EnumDecl *ED);
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index a68d5109baf81..5bca92470f6fe 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1638,6 +1638,13 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) {
     RValue RV =
       EmitNewDeleteCall(*this, allocator, allocatorType, allocatorArgs);
 
+    // Set !heapallocsite metadata on the call to operator new.
+    if (getDebugInfo()) {
+      if (auto *newCall = dyn_cast<llvm::CallBase>(RV.getScalarVal()))
+        getDebugInfo()->addHeapAllocSiteMetadata(newCall, allocType,
+                                                 E->getExprLoc());
+    }
+
     // If this was a call to a global replaceable allocation function that does
     // not take an alignment argument, the allocator is known to produce
     // storage that's suitably aligned for any object that fits, up to a known
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index b169462f535ad..612a2ecef8430 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2065,11 +2065,15 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
       }
     }
 
-    // Update heapallocsite metadata when there is an explicit cast.
-    if (llvm::CallInst *CI = dyn_cast<llvm::CallInst>(Src))
-      if (CI->getMetadata("heapallocsite") && isa<ExplicitCastExpr>(CE))
-          CGF.getDebugInfo()->
-              addHeapAllocSiteMetadata(CI, CE->getType(), CE->getExprLoc());
+    // Update heapallocsite metadata when there is an explicit pointer cast.
+    if (auto *CI = dyn_cast<llvm::CallBase>(Src)) {
+      if (CI->getMetadata("heapallocsite") && isa<ExplicitCastExpr>(CE)) {
+        QualType PointeeType = DestTy->getPointeeType();
+        if (!PointeeType.isNull())
+          CGF.getDebugInfo()->addHeapAllocSiteMetadata(CI, PointeeType,
+                                                       CE->getExprLoc());
+      }
+    }
 
     return Builder.CreateBitCast(Src, DstTy);
   }
diff --git a/clang/test/CodeGen/debug-info-codeview-heapallocsite.c b/clang/test/CodeGen/debug-info-codeview-heapallocsite.c
index dfc0d19b25e85..25c102b1c37dd 100644
--- a/clang/test/CodeGen/debug-info-codeview-heapallocsite.c
+++ b/clang/test/CodeGen/debug-info-codeview-heapallocsite.c
@@ -1,19 +1,22 @@
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -S -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -S -emit-llvm %s -o - | FileCheck %s
 
 struct Foo;
 struct Bar;
 
 __declspec(allocator) void *alloc_void();
+__declspec(allocator) struct Foo *alloc_foo();
 
 void call_alloc() {
   struct Foo *p = alloc_void();
+  struct Foo *w = alloc_foo();
   struct Foo *q = (struct Foo*)alloc_void();
   struct Foo *r = (struct Foo*)(struct Bar*)alloc_void();
 }
 
 // CHECK-LABEL: define {{.*}}void @call_alloc
 // CHECK: call i8* {{.*}}@alloc_void{{.*}} !heapallocsite [[DBG1:!.*]]
-// CHECK: call i8* {{.*}}@alloc_void{{.*}} !heapallocsite [[DBG2:!.*]]
+// CHECK: call %struct.Foo* {{.*}}@alloc_foo{{.*}} !heapallocsite [[DBG2:!.*]]
+// CHECK: call i8* {{.*}}@alloc_void{{.*}} !heapallocsite [[DBG2]]
 // CHECK: call i8* {{.*}}@alloc_void{{.*}} !heapallocsite [[DBG3:!.*]]
 
 // CHECK: [[DBG1]] = !{}
diff --git a/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp b/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp
new file mode 100644
index 0000000000000..ab1101c670942
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-codeview-heapallocsite.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -fexceptions -triple x86_64-windows-msvc -debug-info-kind=limited -gcodeview -fdeclspec -S -emit-llvm %s -o - | FileCheck %s
+
+struct Foo {
+  int x;
+};
+struct Bar {
+  int y;
+};
+extern Foo *gv_foo;
+extern Bar *gv_bar;
+extern "C" void doit() {
+  gv_foo = new Foo();
+  gv_bar = new Bar();
+}
+
+// CHECK-LABEL: define {{.*}}void @doit
+// CHECK: call {{.*}} i8* {{.*}}@"??2@YAPEAX_K@Z"(i64 4) {{.*}} !heapallocsite [[DBG_FOO:!.*]]
+// CHECK: call {{.*}} i8* {{.*}}@"??2@YAPEAX_K@Z"(i64 4) {{.*}} !heapallocsite [[DBG_BAR:!.*]]
+
+extern "C" void useinvoke() {
+  struct HasDtor {
+    ~HasDtor() { delete gv_foo; }
+  } o;
+  gv_foo = new Foo();
+}
+
+// CHECK-LABEL: define {{.*}}void @useinvoke
+// CHECK: invoke {{.*}} i8* {{.*}}@"??2@YAPEAX_K@Z"(i64 4)
+// CHECK-NEXT: to label {{.*}} unwind label {{.*}} !heapallocsite [[DBG_FOO]]
+
+// CHECK: [[DBG_FOO]] = distinct !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:                                 name: "Foo"
+// CHECK: [[DBG_BAR]] = distinct !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:                                 name: "Bar"

From 5d62606f90554751098161b5e99a7bd45a8581ef Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Wed, 27 May 2020 13:09:00 -0700
Subject: [PATCH 13/24] AMDGPU/GlobalISel: cmp/select method for extract
 element

Differential Revision: https://reviews.llvm.org/D80749
---
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   85 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.h    |    6 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   35 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    5 +
 .../AMDGPU/GlobalISel/extractelement.ll       | 1984 ++++++++++-------
 .../regbankselect-extract-vector-elt.mir      | 1344 ++++++-----
 6 files changed, 2174 insertions(+), 1285 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8355eee6a04c3..bec5e0ea08293 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1856,6 +1856,88 @@ static void extendLow32IntoHigh32(MachineIRBuilder &B,
   }
 }
 
+bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  const OperandsMapper &OpdMapper) const {
+
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+
+  const RegisterBank &IdxBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+  bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+  LLT VecTy = MRI.getType(VecReg);
+  unsigned EltSize = VecTy.getScalarSizeInBits();
+  unsigned NumElem = VecTy.getNumElements();
+
+  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+                                                  IsDivergentIdx))
+    return false;
+
+  MachineIRBuilder B(MI);
+  LLT S32 = LLT::scalar(32);
+
+  const RegisterBank &DstBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+  const RegisterBank &SrcBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+
+  const RegisterBank &CCBank =
+    (DstBank == AMDGPU::SGPRRegBank &&
+     SrcBank == AMDGPU::SGPRRegBank &&
+     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+                                     : AMDGPU::VCCRegBank;
+  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+  }
+
+  LLT EltTy = VecTy.getScalarType();
+  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+  unsigned NumLanes = DstRegs.size();
+  if (!NumLanes)
+    NumLanes = 1;
+  else
+    EltTy = MRI.getType(DstRegs[0]);
+
+  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+  SmallVector<Register, 2> Res(NumLanes);
+  for (unsigned L = 0; L < NumLanes; ++L)
+    Res[L] = UnmergeToEltTy.getReg(L);
+
+  for (unsigned I = 1; I < NumElem; ++I) {
+    auto IC = B.buildConstant(S32, I);
+    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+    for (unsigned L = 0; L < NumLanes; ++L) {
+      auto S = B.buildSelect(EltTy, Cmp,
+                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
+
+      for (unsigned N : { 0, 2, 3 })
+        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+      Res[L] = S->getOperand(0).getReg();
+    }
+  }
+
+  for (unsigned L = 0; L < NumLanes; ++L) {
+    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
+    B.buildCopy(DstReg, Res[L]);
+    MRI.setRegBank(DstReg, DstBank);
+  }
+
+  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
@@ -2450,6 +2532,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     LLT DstTy = MRI.getType(DstReg);
     LLT SrcTy = MRI.getType(SrcReg);
 
+    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
+      return;
+
     MachineIRBuilder B(MI);
 
     const ValueMapping &DstMapping
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 69b2f5e812ecf..79a3b48ae1ce6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -179,6 +179,12 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
 
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
+
+private:
+
+  bool foldExtractEltToCmpSelect(MachineInstr &MI,
+                                 MachineRegisterInfo &MRI,
+                                 const OperandsMapper &OpdMapper) const;
 };
 } // End llvm namespace.
 #endif
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8e56615005af4..4e134e84b9b32 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9623,17 +9623,13 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
 
 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
 // expanded into a set of cmp/select instructions.
-static bool shouldExpandVectorDynExt(SDNode *N) {
-  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
-  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
+                                                unsigned NumElem,
+                                                bool IsDivergentIdx) {
+  if (UseDivergentRegisterIndexing)
     return false;
 
-  SDValue Vec = N->getOperand(0);
-  EVT VecVT = Vec.getValueType();
-  EVT EltVT = VecVT.getVectorElementType();
-  unsigned VecSize = VecVT.getSizeInBits();
-  unsigned EltSize = EltVT.getSizeInBits();
-  unsigned NumElem = VecVT.getVectorNumElements();
+  unsigned VecSize = EltSize * NumElem;
 
   // Sub-dword vectors of size 2 dword or less have better implementation.
   if (VecSize <= 64 && EltSize < 32)
@@ -9645,7 +9641,7 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
     return true;
 
   // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (Idx->isDivergent())
+  if (IsDivergentIdx)
     return true;
 
   // Large vectors would yield too many compares and v_cndmask_b32 instructions.
@@ -9654,6 +9650,21 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
   return NumInsts <= 16;
 }
 
+static bool shouldExpandVectorDynExt(SDNode *N) {
+  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+  if (isa<ConstantSDNode>(Idx))
+    return false;
+
+  SDValue Vec = N->getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned EltSize = EltVT.getSizeInBits();
+  unsigned NumElem = VecVT.getVectorNumElements();
+
+  return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+                                                    Idx->isDivergent());
+}
+
 SDValue SITargetLowering::performExtractVectorEltCombine(
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
@@ -9715,7 +9726,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   unsigned EltSize = EltVT.getSizeInBits();
 
   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
-  if (shouldExpandVectorDynExt(N)) {
+  if (::shouldExpandVectorDynExt(N)) {
     SDLoc SL(N);
     SDValue Idx = N->getOperand(1);
     SDValue V;
@@ -9778,7 +9789,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
 
   // INSERT_VECTOR_ELT (<n x e>, var-idx)
   // => BUILD_VECTOR n x select (e, const-idx)
-  if (!shouldExpandVectorDynExt(N))
+  if (!::shouldExpandVectorDynExt(N))
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1284eb9155ba5..997075ecd63f0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -203,6 +203,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   /// and not emit a relocation for an LDS global.
   bool shouldUseLDSConstAddress(const GlobalValue *GV) const;
 
+  /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+  /// expanded into a set of cmp/select instructions.
+  static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+                                       bool IsDivergentIdx);
+
 private:
   // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 202ddb0d21a28..4b78c605e0b74 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -6,27 +6,25 @@ define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s11, 0x41000000
-; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
-; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
-; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
-; GCN-NEXT:    s_mov_b32 s7, 4.0
-; GCN-NEXT:    s_mov_b32 s6, 0x40400000
-; GCN-NEXT:    s_mov_b32 s5, 2.0
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b64 s[12:13], exec
-; GCN-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s14, v0
-; GCN-NEXT:    s_mov_b32 m0, s14
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s14, v0
-; GCN-NEXT:    s_movrels_b32 s14, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB0_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[12:13]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, 2.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x40c00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40e00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x41000000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
@@ -36,16 +34,20 @@ entry:
 define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_const_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b32 m0, s2
-; GCN-NEXT:    s_mov_b32 s11, 0x41000000
-; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
-; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
-; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
-; GCN-NEXT:    s_mov_b32 s7, 4.0
-; GCN-NEXT:    s_mov_b32 s6, 0x40400000
-; GCN-NEXT:    s_mov_b32 s5, 2.0
-; GCN-NEXT:    s_movrels_b32 s0, s4
+; GCN-NEXT:    s_cmp_eq_u32 s2, 1
+; GCN-NEXT:    s_cselect_b32 s0, 2.0, 1.0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 2
+; GCN-NEXT:    s_cselect_b32 s0, 0x40400000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 3
+; GCN-NEXT:    s_cselect_b32 s0, 4.0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 4
+; GCN-NEXT:    s_cselect_b32 s0, 0x40a00000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 5
+; GCN-NEXT:    s_cselect_b32 s0, 0x40c00000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 6
+; GCN-NEXT:    s_cselect_b32 s0, 0x40e00000, s0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 7
+; GCN-NEXT:    s_cselect_b32 s0, 0x41000000, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -59,24 +61,29 @@ define amdgpu_ps float @dyn_extract_v8f32_s_v(<8 x float> inreg %vec, i32 %sel)
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s3
 ; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    s_mov_b64 s[8:9], exec
-; GCN-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mov_b32 m0, s10
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s10, v0
-; GCN-NEXT:    s_movrels_b32 s10, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB2_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT:    v_mov_b32_e32 v7, s8
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_mov_b32_e32 v8, s9
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v8, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
@@ -84,58 +91,47 @@ entry:
 }
 
 define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB3_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT:    v_movrels_b32_e32 v9, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB3_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_v_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_v_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_v_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
   ret float %ext
@@ -144,16 +140,20 @@ entry:
 define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 m0, s10
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 1
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s10, 2
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 3
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 4
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 5
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 6
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 7
+; GCN-NEXT:    s_cselect_b32 s0, s9, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -165,28 +165,51 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8i64_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b64 s[18:19], 8
-; GCN-NEXT:    s_mov_b64 s[16:17], 7
-; GCN-NEXT:    s_mov_b64 s[14:15], 6
-; GCN-NEXT:    s_mov_b64 s[12:13], 5
-; GCN-NEXT:    s_mov_b64 s[10:11], 4
-; GCN-NEXT:    s_mov_b64 s[8:9], 3
-; GCN-NEXT:    s_mov_b64 s[6:7], 2
 ; GCN-NEXT:    s_mov_b64 s[4:5], 1
-; GCN-NEXT:    s_mov_b64 s[20:21], exec
-; GCN-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s22, v0
-; GCN-NEXT:    s_lshl_b32 m0, s22, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s22, v0
-; GCN-NEXT:    s_movrels_b32 s22, s4
-; GCN-NEXT:    s_movrels_b32 s23, s5
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB6_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[20:21]
-; GCN-NEXT:    v_mov_b32_e32 v0, s22
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NEXT:    s_mov_b64 s[6:7], 2
+; GCN-NEXT:    s_mov_b64 s[8:9], 3
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    s_mov_b64 s[10:11], 4
+; GCN-NEXT:    v_mov_b32_e32 v5, s8
+; GCN-NEXT:    v_mov_b32_e32 v6, s9
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    s_mov_b64 s[12:13], 5
+; GCN-NEXT:    v_mov_b32_e32 v7, s10
+; GCN-NEXT:    v_mov_b32_e32 v8, s11
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    s_mov_b64 s[14:15], 6
+; GCN-NEXT:    v_mov_b32_e32 v9, s12
+; GCN-NEXT:    v_mov_b32_e32 v10, s13
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    s_mov_b64 s[16:17], 7
+; GCN-NEXT:    v_mov_b32_e32 v11, s14
+; GCN-NEXT:    v_mov_b32_e32 v12, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    s_mov_b64 s[18:19], 8
+; GCN-NEXT:    v_mov_b32_e32 v13, s16
+; GCN-NEXT:    v_mov_b32_e32 v14, s17
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_mov_b32_e32 v15, s18
+; GCN-NEXT:    v_mov_b32_e32 v16, s19
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v16, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel
@@ -242,31 +265,50 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; GPRIDX-NEXT:    s_mov_b32 s3, s5
 ; GPRIDX-NEXT:    s_mov_b32 s4, s6
 ; GPRIDX-NEXT:    s_mov_b32 s5, s7
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s2
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v4, s3
 ; GPRIDX-NEXT:    s_mov_b32 s6, s8
 ; GPRIDX-NEXT:    s_mov_b32 s7, s9
+; GPRIDX-NEXT:    v_mov_b32_e32 v5, s4
+; GPRIDX-NEXT:    v_mov_b32_e32 v6, s5
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; GPRIDX-NEXT:    s_mov_b32 s8, s10
 ; GPRIDX-NEXT:    s_mov_b32 s9, s11
+; GPRIDX-NEXT:    v_mov_b32_e32 v7, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v8, s7
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GPRIDX-NEXT:    s_mov_b32 s10, s12
 ; GPRIDX-NEXT:    s_mov_b32 s11, s13
-; GPRIDX-NEXT:    s_mov_b32 s12, s14
-; GPRIDX-NEXT:    s_mov_b32 s13, s15
-; GPRIDX-NEXT:    s_mov_b32 s14, s16
-; GPRIDX-NEXT:    s_mov_b32 s15, s17
-; GPRIDX-NEXT:    s_mov_b64 s[16:17], exec
-; GPRIDX-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s18, v0
-; GPRIDX-NEXT:    s_lshl_b32 m0, s18, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s18, v0
-; GPRIDX-NEXT:    s_movrels_b32 s18, s0
-; GPRIDX-NEXT:    s_movrels_b32 s19, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s18
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s19
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB8_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[16:17]
-; GPRIDX-NEXT:    global_store_dwordx2 v[0:1], v[1:2], off
+; GPRIDX-NEXT:    v_mov_b32_e32 v9, s8
+; GPRIDX-NEXT:    v_mov_b32_e32 v10, s9
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v11, s10
+; GPRIDX-NEXT:    v_mov_b32_e32 v12, s11
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v13, s14
+; GPRIDX-NEXT:    v_mov_b32_e32 v14, s15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GPRIDX-NEXT:    v_mov_b32_e32 v15, s16
+; GPRIDX-NEXT:    v_mov_b32_e32 v16, s17
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v2, v16, vcc
+; GPRIDX-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v8i64_s_v:
@@ -277,31 +319,50 @@ define amdgpu_ps void @dyn_extract_v8i64_s_v(<8 x i64> inreg %vec, i32 %sel) {
 ; MOVREL-NEXT:    s_mov_b32 s3, s5
 ; MOVREL-NEXT:    s_mov_b32 s4, s6
 ; MOVREL-NEXT:    s_mov_b32 s5, s7
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s2
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v4, s3
 ; MOVREL-NEXT:    s_mov_b32 s6, s8
 ; MOVREL-NEXT:    s_mov_b32 s7, s9
+; MOVREL-NEXT:    v_mov_b32_e32 v5, s4
+; MOVREL-NEXT:    v_mov_b32_e32 v6, s5
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; MOVREL-NEXT:    s_mov_b32 s8, s10
 ; MOVREL-NEXT:    s_mov_b32 s9, s11
+; MOVREL-NEXT:    v_mov_b32_e32 v7, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v8, s7
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; MOVREL-NEXT:    s_mov_b32 s10, s12
 ; MOVREL-NEXT:    s_mov_b32 s11, s13
-; MOVREL-NEXT:    s_mov_b32 s12, s14
-; MOVREL-NEXT:    s_mov_b32 s13, s15
-; MOVREL-NEXT:    s_mov_b32 s14, s16
-; MOVREL-NEXT:    s_mov_b32 s15, s17
-; MOVREL-NEXT:    s_mov_b64 s[16:17], exec
-; MOVREL-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s18, v0
-; MOVREL-NEXT:    s_lshl_b32 m0, s18, 1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s18, v0
-; MOVREL-NEXT:    s_movrels_b32 s18, s0
-; MOVREL-NEXT:    s_movrels_b32 s19, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s18
-; MOVREL-NEXT:    v_mov_b32_e32 v2, s19
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB8_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[16:17]
-; MOVREL-NEXT:    flat_store_dwordx2 v[0:1], v[1:2]
+; MOVREL-NEXT:    v_mov_b32_e32 v9, s8
+; MOVREL-NEXT:    v_mov_b32_e32 v10, s9
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v11, s10
+; MOVREL-NEXT:    v_mov_b32_e32 v12, s11
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v13, s14
+; MOVREL-NEXT:    v_mov_b32_e32 v14, s15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; MOVREL-NEXT:    v_mov_b32_e32 v15, s16
+; MOVREL-NEXT:    v_mov_b32_e32 v16, s17
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v2, v16, vcc
+; MOVREL-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 ; MOVREL-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i64> %vec, i32 %sel
@@ -310,45 +371,31 @@ entry:
 }
 
 define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v8i64_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v16
-; GPRIDX-NEXT:    s_lshl_b32 s7, s6, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
-; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, v0
-; GPRIDX-NEXT:    v_mov_b32_e32 v18, v1
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB9_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v17
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, v18
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8i64_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v16
-; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
-; MOVREL-NEXT:    v_movrels_b32_e32 v17, v0
-; MOVREL-NEXT:    v_movrels_b32_e32 v18, v1
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB9_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v17
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v18
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8i64_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i64> %vec, i32 %sel
   ret i64 %ext
@@ -437,16 +484,21 @@ entry:
 define amdgpu_ps float @dyn_extract_v8f32_s_s_offset3(<8 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_s_s_offset3:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 m0, s10
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s7, s9
-; GCN-NEXT:    s_movrels_b32 s0, s3
+; GCN-NEXT:    s_add_i32 s10, s10, 3
+; GCN-NEXT:    s_cmp_eq_u32 s10, 1
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s10, 2
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 3
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 4
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 5
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 6
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
+; GCN-NEXT:    s_cmp_eq_u32 s10, 7
+; GCN-NEXT:    s_cselect_b32 s0, s9, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -459,36 +511,41 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB13_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, v3
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB13_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
+; GPRIDX-NEXT:    v_add_u32_e32 v8, 3, v8
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB13_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT:    v_movrels_b32_e32 v9, v3
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB13_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
+; MOVREL-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -747,43 +804,55 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GPRIDX-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB22_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v16
-; GPRIDX-NEXT:    s_add_i32 s7, s6, 3
-; GPRIDX-NEXT:    s_lshl_b32 s7, s7, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
-; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, v0
-; GPRIDX-NEXT:    v_mov_b32_e32 v18, v1
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB22_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v17
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, v18
+; GPRIDX-NEXT:    v_add_u32_e32 v16, 3, v16
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB22_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v16
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
-; MOVREL-NEXT:    s_add_i32 s6, s6, 3
-; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
-; MOVREL-NEXT:    v_movrels_b32_e32 v17, v0
-; MOVREL-NEXT:    v_movrels_b32_e32 v18, v1
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB22_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v17
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v18
+; MOVREL-NEXT:    v_add_u32_e32 v16, vcc, 3, v16
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -792,40 +861,24 @@ entry:
 }
 
 define i8 addrspace(3)* @dyn_extract_v8p3_v_v(<8 x i8 addrspace(3)*> %vec, i32 %idx) {
-; GPRIDX-LABEL: dyn_extract_v8p3_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB23_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v8
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB23_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8p3_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB23_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v8
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
-; MOVREL-NEXT:    v_movrels_b32_e32 v9, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB23_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v9
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8p3_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx
   ret i8 addrspace(3)* %ext
@@ -834,32 +887,40 @@ entry:
 define amdgpu_ps void @dyn_extract_v8p3_s_s(<8 x i8 addrspace(3)*> inreg %vec, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_extract_v8p3_s_s:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_mov_b32 s0, s2
-; GPRIDX-NEXT:    s_mov_b32 m0, s10
-; GPRIDX-NEXT:    s_mov_b32 s1, s3
-; GPRIDX-NEXT:    s_mov_b32 s2, s4
-; GPRIDX-NEXT:    s_mov_b32 s3, s5
-; GPRIDX-NEXT:    s_mov_b32 s4, s6
-; GPRIDX-NEXT:    s_mov_b32 s5, s7
-; GPRIDX-NEXT:    s_mov_b32 s6, s8
-; GPRIDX-NEXT:    s_mov_b32 s7, s9
-; GPRIDX-NEXT:    s_movrels_b32 s0, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 1
+; GPRIDX-NEXT:    s_cselect_b32 s0, s3, s2
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 2
+; GPRIDX-NEXT:    s_cselect_b32 s0, s4, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 3
+; GPRIDX-NEXT:    s_cselect_b32 s0, s5, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 4
+; GPRIDX-NEXT:    s_cselect_b32 s0, s6, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 5
+; GPRIDX-NEXT:    s_cselect_b32 s0, s7, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 6
+; GPRIDX-NEXT:    s_cselect_b32 s0, s8, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s10, 7
+; GPRIDX-NEXT:    s_cselect_b32 s0, s9, s0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT:    ds_write_b32 v0, v0
 ; GPRIDX-NEXT:    s_endpgm
 ;
 ; MOVREL-LABEL: dyn_extract_v8p3_s_s:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 s0, s2
-; MOVREL-NEXT:    s_mov_b32 m0, s10
-; MOVREL-NEXT:    s_mov_b32 s1, s3
-; MOVREL-NEXT:    s_mov_b32 s2, s4
-; MOVREL-NEXT:    s_mov_b32 s3, s5
-; MOVREL-NEXT:    s_mov_b32 s4, s6
-; MOVREL-NEXT:    s_mov_b32 s5, s7
-; MOVREL-NEXT:    s_mov_b32 s6, s8
-; MOVREL-NEXT:    s_mov_b32 s7, s9
-; MOVREL-NEXT:    s_movrels_b32 s0, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 1
+; MOVREL-NEXT:    s_cselect_b32 s0, s3, s2
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 2
+; MOVREL-NEXT:    s_cselect_b32 s0, s4, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 3
+; MOVREL-NEXT:    s_cselect_b32 s0, s5, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 4
+; MOVREL-NEXT:    s_cselect_b32 s0, s6, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 5
+; MOVREL-NEXT:    s_cselect_b32 s0, s7, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 6
+; MOVREL-NEXT:    s_cselect_b32 s0, s8, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s10, 7
+; MOVREL-NEXT:    s_cselect_b32 s0, s9, s0
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
 ; MOVREL-NEXT:    s_mov_b32 m0, -1
 ; MOVREL-NEXT:    ds_write_b32 v0, v0
@@ -871,45 +932,31 @@ entry:
 }
 
 define i8 addrspace(1)* @dyn_extract_v8p1_v_v(<8 x i8 addrspace(1)*> %vec, i32 %idx) {
-; GPRIDX-LABEL: dyn_extract_v8p1_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB25_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v16
-; GPRIDX-NEXT:    s_lshl_b32 s7, s6, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
-; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v17, v0
-; GPRIDX-NEXT:    v_mov_b32_e32 v18, v1
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB25_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v17
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, v18
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v8p1_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB25_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v16
-; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v16
-; MOVREL-NEXT:    v_movrels_b32_e32 v17, v0
-; MOVREL-NEXT:    v_movrels_b32_e32 v18, v1
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB25_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v17
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v18
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v8p1_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i8 addrspace(1)*> %vec, i32 %idx
   ret i8 addrspace(1)* %ext
@@ -1149,23 +1196,22 @@ define amdgpu_ps float @dyn_extract_v6f32_s_v(<6 x float> inreg %vec, i32 %sel)
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:  BB33_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_mov_b32 m0, s8
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v0
-; GCN-NEXT:    s_movrels_b32 s8, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB33_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
@@ -1173,58 +1219,39 @@ entry:
 }
 
 define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB34_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v6
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v7, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB34_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v6f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB34_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v6
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
-; MOVREL-NEXT:    v_movrels_b32_e32 v7, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB34_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v6f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v6f32_v_s(<6 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f32_v_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v6f32_v_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v6f32_v_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
   ret float %ext
@@ -1233,14 +1260,16 @@ entry:
 define amdgpu_ps float @dyn_extract_v6f32_s_s(<6 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v6f32_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 m0, s8
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 1
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s8, 2
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 3
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 4
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 5
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -1254,23 +1283,25 @@ define amdgpu_ps float @dyn_extract_v7f32_s_v(<7 x float> inreg %vec, i32 %sel)
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s3
 ; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b64 s[8:9], exec
-; GCN-NEXT:  BB37_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s7, v0
-; GCN-NEXT:    s_mov_b32 m0, s7
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v0
-; GCN-NEXT:    s_movrels_b32 s7, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB37_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT:    v_mov_b32_e32 v7, s8
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v7, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
@@ -1278,58 +1309,43 @@ entry:
 }
 
 define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB38_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v7
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB38_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v7f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB38_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v7
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
-; MOVREL-NEXT:    v_movrels_b32_e32 v8, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB38_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v8
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v7f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
   ret float %ext
 }
 
 define amdgpu_ps float @dyn_extract_v7f32_v_s(<7 x float> %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f32_v_s:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v7f32_v_s:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    v_movrels_b32_e32 v0, v0
-; MOVREL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v7f32_v_s:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
   ret float %ext
@@ -1338,15 +1354,18 @@ entry:
 define amdgpu_ps float @dyn_extract_v7f32_s_s(<7 x float> inreg %vec, i32 inreg %sel) {
 ; GCN-LABEL: dyn_extract_v7f32_s_s:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s0, s2
-; GCN-NEXT:    s_mov_b32 m0, s9
-; GCN-NEXT:    s_mov_b32 s1, s3
-; GCN-NEXT:    s_mov_b32 s2, s4
-; GCN-NEXT:    s_mov_b32 s3, s5
-; GCN-NEXT:    s_mov_b32 s4, s6
-; GCN-NEXT:    s_mov_b32 s5, s7
-; GCN-NEXT:    s_mov_b32 s6, s8
-; GCN-NEXT:    s_movrels_b32 s0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 1
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_cmp_eq_u32 s9, 2
+; GCN-NEXT:    s_cselect_b32 s0, s4, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 3
+; GCN-NEXT:    s_cselect_b32 s0, s5, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 4
+; GCN-NEXT:    s_cselect_b32 s0, s6, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 5
+; GCN-NEXT:    s_cselect_b32 s0, s7, s0
+; GCN-NEXT:    s_cmp_eq_u32 s9, 6
+; GCN-NEXT:    s_cselect_b32 s0, s8, s0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
@@ -1357,30 +1376,43 @@ entry:
 define amdgpu_ps double @dyn_extract_v6f64_s_v(<6 x double> inreg %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v6f64_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s16, s2
-; GCN-NEXT:    s_mov_b32 s17, s3
-; GCN-NEXT:    s_mov_b32 s18, s4
-; GCN-NEXT:    s_mov_b32 s19, s5
-; GCN-NEXT:    s_mov_b32 s20, s6
-; GCN-NEXT:    s_mov_b32 s21, s7
-; GCN-NEXT:    s_mov_b32 s22, s8
-; GCN-NEXT:    s_mov_b32 s23, s9
-; GCN-NEXT:    s_mov_b32 s24, s10
-; GCN-NEXT:    s_mov_b32 s25, s11
-; GCN-NEXT:    s_mov_b32 s26, s12
-; GCN-NEXT:    s_mov_b32 s27, s13
-; GCN-NEXT:    s_mov_b64 s[2:3], exec
-; GCN-NEXT:  BB41_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v7, s6
+; GCN-NEXT:    v_mov_b32_e32 v8, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v9, s10
+; GCN-NEXT:    v_mov_b32_e32 v10, s11
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v11, s12
+; GCN-NEXT:    v_mov_b32_e32 v12, s13
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v12, vcc
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_lshl_b32 m0, s0, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; GCN-NEXT:    s_movrels_b32 s0, s16
-; GCN-NEXT:    s_movrels_b32 s1, s17
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB41_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    v_readfirstlane_b32 s1, v1
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <6 x double> %vec, i32 %sel
@@ -1388,45 +1420,25 @@ entry:
 }
 
 define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v6f64_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB42_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v12
-; GPRIDX-NEXT:    s_lshl_b32 s7, s6, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v12
-; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v13, v0
-; GPRIDX-NEXT:    v_mov_b32_e32 v14, v1
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB42_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v13
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, v14
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v6f64_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB42_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v12
-; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
-; MOVREL-NEXT:    v_movrels_b32_e32 v13, v0
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v12
-; MOVREL-NEXT:    v_movrels_b32_e32 v14, v1
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB42_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v13
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v14
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v6f64_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <6 x double> %vec, i32 %sel
   ret double %ext
@@ -1483,32 +1495,50 @@ entry:
 define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v7f64_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_mov_b32 s16, s2
-; GCN-NEXT:    s_mov_b32 s17, s3
-; GCN-NEXT:    s_mov_b32 s18, s4
-; GCN-NEXT:    s_mov_b32 s19, s5
-; GCN-NEXT:    s_mov_b32 s20, s6
-; GCN-NEXT:    s_mov_b32 s21, s7
-; GCN-NEXT:    s_mov_b32 s22, s8
-; GCN-NEXT:    s_mov_b32 s23, s9
-; GCN-NEXT:    s_mov_b32 s24, s10
-; GCN-NEXT:    s_mov_b32 s25, s11
-; GCN-NEXT:    s_mov_b32 s26, s12
-; GCN-NEXT:    s_mov_b32 s27, s13
-; GCN-NEXT:    s_mov_b32 s28, s14
-; GCN-NEXT:    s_mov_b32 s29, s15
-; GCN-NEXT:    s_mov_b64 s[2:3], exec
-; GCN-NEXT:  BB45_1: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_mov_b32 s0, s2
+; GCN-NEXT:    s_mov_b32 s1, s3
+; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    v_mov_b32_e32 v7, s6
+; GCN-NEXT:    v_mov_b32_e32 v8, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v9, s8
+; GCN-NEXT:    v_mov_b32_e32 v10, s9
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v11, s12
+; GCN-NEXT:    v_mov_b32_e32 v12, s13
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_mov_b32_e32 v13, s14
+; GCN-NEXT:    v_mov_b32_e32 v14, s15
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v13, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v14, vcc
 ; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_lshl_b32 m0, s0, 1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; GCN-NEXT:    s_movrels_b32 s0, s16
-; GCN-NEXT:    s_movrels_b32 s1, s17
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB45_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[2:3]
+; GCN-NEXT:    v_readfirstlane_b32 s1, v1
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <7 x double> %vec, i32 %sel
@@ -1516,45 +1546,28 @@ entry:
 }
 
 define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v7f64_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB46_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v14
-; GPRIDX-NEXT:    s_lshl_b32 s7, s6, 1
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v14
-; GPRIDX-NEXT:    s_set_gpr_idx_on s7, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v15, v0
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, v1
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB46_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v15
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, v16
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v7f64_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB46_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v14
-; MOVREL-NEXT:    s_lshl_b32 m0, s6, 1
-; MOVREL-NEXT:    v_movrels_b32_e32 v15, v0
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v14
-; MOVREL-NEXT:    v_movrels_b32_e32 v16, v1
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB46_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v15
-; MOVREL-NEXT:    v_mov_b32_e32 v1, v16
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v7f64_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x double> %vec, i32 %sel
   ret double %ext
@@ -1622,7 +1635,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -1665,7 +1678,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 9
 ; GPRIDX-NEXT:     workitem_vgpr_count = 4
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -1681,22 +1694,25 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; GPRIDX-NEXT:     runtime_loader_kernel_symbol = 0
 ; GPRIDX-NEXT:    .end_amd_kernel_code_t
 ; GPRIDX-NEXT:  ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GPRIDX-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GPRIDX-NEXT:    s_mov_b32 s16, 0
-; GPRIDX-NEXT:    s_mov_b64 s[8:9], 1.0
-; GPRIDX-NEXT:    s_mov_b32 s17, 0x40140000
-; GPRIDX-NEXT:    s_mov_b64 s[14:15], 4.0
+; GPRIDX-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GPRIDX-NEXT:    s_load_dword s8, s[4:5], 0x8
+; GPRIDX-NEXT:    s_mov_b32 s0, 0
+; GPRIDX-NEXT:    s_mov_b32 s3, 0x40080000
+; GPRIDX-NEXT:    s_mov_b32 s2, s0
+; GPRIDX-NEXT:    s_mov_b32 s1, 0x40140000
 ; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b32 m0, s2
-; GPRIDX-NEXT:    s_mov_b32 s13, 0x40080000
-; GPRIDX-NEXT:    s_mov_b32 s12, s16
-; GPRIDX-NEXT:    s_mov_b64 s[10:11], 2.0
-; GPRIDX-NEXT:    s_movrels_b64 s[2:3], s[8:9]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, s2
-; GPRIDX-NEXT:    v_mov_b32_e32 v3, s1
-; GPRIDX-NEXT:    v_mov_b32_e32 v1, s3
-; GPRIDX-NEXT:    v_mov_b32_e32 v2, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 1
+; GPRIDX-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 2
+; GPRIDX-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 3
+; GPRIDX-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
+; GPRIDX-NEXT:    s_cmp_eq_u32 s8, 4
+; GPRIDX-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s6
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s7
 ; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GPRIDX-NEXT:    s_endpgm
 ;
@@ -1711,7 +1727,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -1754,7 +1770,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 24
+; MOVREL-NEXT:     wavefront_sgpr_count = 9
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -1770,22 +1786,25 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
 ; MOVREL-NEXT:     runtime_loader_kernel_symbol = 0
 ; MOVREL-NEXT:    .end_amd_kernel_code_t
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; MOVREL-NEXT:    s_load_dword s2, s[4:5], 0x8
-; MOVREL-NEXT:    s_mov_b32 s16, 0
-; MOVREL-NEXT:    s_mov_b64 s[8:9], 1.0
-; MOVREL-NEXT:    s_mov_b32 s17, 0x40140000
-; MOVREL-NEXT:    s_mov_b64 s[14:15], 4.0
+; MOVREL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; MOVREL-NEXT:    s_load_dword s8, s[4:5], 0x8
+; MOVREL-NEXT:    s_mov_b32 s0, 0
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
+; MOVREL-NEXT:    s_mov_b32 s2, s0
+; MOVREL-NEXT:    s_mov_b32 s1, 0x40140000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b32 m0, s2
-; MOVREL-NEXT:    s_mov_b32 s13, 0x40080000
-; MOVREL-NEXT:    s_mov_b32 s12, s16
-; MOVREL-NEXT:    s_mov_b64 s[10:11], 2.0
-; MOVREL-NEXT:    s_movrels_b64 s[2:3], s[8:9]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
-; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
-; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
-; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
+; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
+; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
+; MOVREL-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s6
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s7
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; MOVREL-NEXT:    s_endpgm
 entry:
@@ -1798,34 +1817,46 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
 ; GCN-LABEL: dyn_extract_v15f32_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s18, 0x41700000
-; GCN-NEXT:    s_mov_b32 s17, 0x41600000
-; GCN-NEXT:    s_mov_b32 s16, 0x41500000
-; GCN-NEXT:    s_mov_b32 s15, 0x41400000
-; GCN-NEXT:    s_mov_b32 s14, 0x41300000
-; GCN-NEXT:    s_mov_b32 s13, 0x41200000
-; GCN-NEXT:    s_mov_b32 s12, 0x41100000
-; GCN-NEXT:    s_mov_b32 s11, 0x41000000
-; GCN-NEXT:    s_mov_b32 s10, 0x40e00000
-; GCN-NEXT:    s_mov_b32 s9, 0x40c00000
-; GCN-NEXT:    s_mov_b32 s8, 0x40a00000
-; GCN-NEXT:    s_mov_b32 s7, 4.0
-; GCN-NEXT:    s_mov_b32 s6, 0x40400000
-; GCN-NEXT:    s_mov_b32 s5, 2.0
-; GCN-NEXT:    s_mov_b32 s4, 1.0
-; GCN-NEXT:    s_mov_b64 s[20:21], exec
-; GCN-NEXT:  BB50_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s19, v0
-; GCN-NEXT:    s_mov_b32 m0, s19
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s19, v0
-; GCN-NEXT:    s_movrels_b32 s19, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB50_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[20:21]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x40c00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40e00000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x41000000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x41100000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x41200000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_mov_b32_e32 v8, 0x41300000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GCN-NEXT:    v_mov_b32_e32 v9, 0x41400000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_mov_b32_e32 v10, 0x41500000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; GCN-NEXT:    v_mov_b32_e32 v11, 0x41600000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_mov_b32_e32 v12, 0x41700000
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v12, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <15 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>, i32 %sel
@@ -1865,31 +1896,57 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s3
 ; GCN-NEXT:    s_mov_b32 s2, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s3, s5
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s6
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GCN-NEXT:    s_mov_b32 s5, s7
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v0
 ; GCN-NEXT:    s_mov_b32 s6, s8
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_mov_b32_e32 v6, s5
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
 ; GCN-NEXT:    s_mov_b32 s7, s9
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT:    v_mov_b32_e32 v7, s6
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
 ; GCN-NEXT:    s_mov_b32 s8, s10
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GCN-NEXT:    v_mov_b32_e32 v8, s7
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GCN-NEXT:    s_mov_b32 s9, s11
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
+; GCN-NEXT:    v_mov_b32_e32 v9, s8
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v0
 ; GCN-NEXT:    s_mov_b32 s10, s12
-; GCN-NEXT:    s_mov_b32 s11, s13
-; GCN-NEXT:    s_mov_b32 s12, s14
-; GCN-NEXT:    s_mov_b32 s14, s16
-; GCN-NEXT:    s_mov_b32 s13, s15
-; GCN-NEXT:    s_mov_b64 s[16:17], exec
-; GCN-NEXT:  BB52_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mov_b32 m0, s15
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v0
-; GCN-NEXT:    s_movrels_b32 s15, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GCN-NEXT:    s_xor_b64 exec, exec, vcc
-; GCN-NEXT:    s_cbranch_execnz BB52_1
-; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[16:17]
-; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GCN-NEXT:    v_mov_b32_e32 v10, s9
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; GCN-NEXT:    v_mov_b32_e32 v11, s10
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    v_mov_b32_e32 v12, s13
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
+; GCN-NEXT:    v_mov_b32_e32 v13, s14
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT:    v_mov_b32_e32 v14, s15
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
+; GCN-NEXT:    v_mov_b32_e32 v15, s16
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
 ; GCN-NEXT:    ; return to shader part epilog
 entry:
   %ext = extractelement <15 x float> %vec, i32 %sel
@@ -1897,40 +1954,38 @@ entry:
 }
 
 define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
-; GPRIDX-LABEL: dyn_extract_v15f32_v_v:
-; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB53_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v15
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, v0
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB53_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v16
-; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
-;
-; MOVREL-LABEL: dyn_extract_v15f32_v_v:
-; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB53_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v15
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
-; MOVREL-NEXT:    v_movrels_b32_e32 v16, v0
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB53_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v16
-; MOVREL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: dyn_extract_v15f32_v_v:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <15 x float> %vec, i32 %sel
   ret float %ext
@@ -2013,39 +2068,430 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GPRIDX-LABEL: dyn_extract_v15f32_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT:    s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT:  BB57_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT:    v_readfirstlane_b32 s6, v15
-; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
-; GPRIDX-NEXT:    s_set_gpr_idx_on s6, gpr_idx(SRC0)
-; GPRIDX-NEXT:    v_mov_b32_e32 v16, v3
-; GPRIDX-NEXT:    s_set_gpr_idx_off
-; GPRIDX-NEXT:    s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT:    s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT:    s_cbranch_execnz BB57_1
-; GPRIDX-NEXT:  ; %bb.2:
-; GPRIDX-NEXT:    s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT:    v_mov_b32_e32 v0, v16
+; GPRIDX-NEXT:    v_add_u32_e32 v15, 3, v15
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v15
+; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v15f32_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT:    s_mov_b64 s[4:5], exec
-; MOVREL-NEXT:  BB57_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT:    v_readfirstlane_b32 s6, v15
-; MOVREL-NEXT:    s_mov_b32 m0, s6
-; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v15
-; MOVREL-NEXT:    v_movrels_b32_e32 v16, v3
-; MOVREL-NEXT:    s_and_saveexec_b64 vcc, vcc
-; MOVREL-NEXT:    s_xor_b64 exec, exec, vcc
-; MOVREL-NEXT:    s_cbranch_execnz BB57_1
-; MOVREL-NEXT:  ; %bb.2:
-; MOVREL-NEXT:    s_mov_b64 exec, s[4:5]
-; MOVREL-NEXT:    v_mov_b32_e32 v0, v16
+; MOVREL-NEXT:    v_add_u32_e32 v15, vcc, 3, v15
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 8, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v15
+; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
   %ext = extractelement <15 x float> %vec, i32 %add
   ret float %ext
 }
+
+define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v4f32_s_s_s:
+; GPRIDX:         .amd_kernel_code_t
+; GPRIDX-NEXT:     amd_code_version_major = 1
+; GPRIDX-NEXT:     amd_code_version_minor = 2
+; GPRIDX-NEXT:     amd_machine_kind = 1
+; GPRIDX-NEXT:     amd_machine_version_major = 9
+; GPRIDX-NEXT:     amd_machine_version_minor = 0
+; GPRIDX-NEXT:     amd_machine_version_stepping = 0
+; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
+; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
+; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 0
+; GPRIDX-NEXT:     priority = 0
+; GPRIDX-NEXT:     float_mode = 240
+; GPRIDX-NEXT:     priv = 0
+; GPRIDX-NEXT:     enable_dx10_clamp = 1
+; GPRIDX-NEXT:     debug_mode = 0
+; GPRIDX-NEXT:     enable_ieee_mode = 1
+; GPRIDX-NEXT:     enable_wgp_mode = 0
+; GPRIDX-NEXT:     enable_mem_ordered = 0
+; GPRIDX-NEXT:     enable_fwd_progress = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT:     user_sgpr_count = 6
+; GPRIDX-NEXT:     enable_trap_handler = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_z = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_info = 0
+; GPRIDX-NEXT:     enable_vgpr_workitem_id = 0
+; GPRIDX-NEXT:     enable_exception_msb = 0
+; GPRIDX-NEXT:     granulated_lds_size = 0
+; GPRIDX-NEXT:     enable_exception = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GPRIDX-NEXT:     enable_sgpr_dispatch_ptr = 0
+; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
+; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GPRIDX-NEXT:     enable_wavefront_size32 = 0
+; GPRIDX-NEXT:     enable_ordered_append_gds = 0
+; GPRIDX-NEXT:     private_element_size = 1
+; GPRIDX-NEXT:     is_ptr64 = 1
+; GPRIDX-NEXT:     is_dynamic_callstack = 0
+; GPRIDX-NEXT:     is_debug_enabled = 0
+; GPRIDX-NEXT:     is_xnack_enabled = 0
+; GPRIDX-NEXT:     workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT:     workgroup_group_segment_byte_size = 0
+; GPRIDX-NEXT:     gds_segment_byte_size = 0
+; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
+; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
+; GPRIDX-NEXT:     wavefront_sgpr_count = 6
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     reserved_vgpr_first = 0
+; GPRIDX-NEXT:     reserved_vgpr_count = 0
+; GPRIDX-NEXT:     reserved_sgpr_first = 0
+; GPRIDX-NEXT:     reserved_sgpr_count = 0
+; GPRIDX-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GPRIDX-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GPRIDX-NEXT:     kernarg_segment_alignment = 4
+; GPRIDX-NEXT:     group_segment_alignment = 4
+; GPRIDX-NEXT:     private_segment_alignment = 4
+; GPRIDX-NEXT:     wavefront_size = 6
+; GPRIDX-NEXT:     call_convention = -1
+; GPRIDX-NEXT:     runtime_loader_kernel_symbol = 0
+; GPRIDX-NEXT:    .end_amd_kernel_code_t
+; GPRIDX-NEXT:  ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GPRIDX-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 1
+; GPRIDX-NEXT:    s_cselect_b32 s3, 2.0, 1.0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 2
+; GPRIDX-NEXT:    s_cselect_b32 s3, 0x40400000, s3
+; GPRIDX-NEXT:    s_cmp_eq_u32 s2, 3
+; GPRIDX-NEXT:    s_cselect_b32 s2, 4.0, s3
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    global_store_dword v[0:1], v2, off
+; GPRIDX-NEXT:    s_endpgm
+;
+; MOVREL-LABEL: dyn_extract_v4f32_s_s_s:
+; MOVREL:         .amd_kernel_code_t
+; MOVREL-NEXT:     amd_code_version_major = 1
+; MOVREL-NEXT:     amd_code_version_minor = 2
+; MOVREL-NEXT:     amd_machine_kind = 1
+; MOVREL-NEXT:     amd_machine_version_major = 8
+; MOVREL-NEXT:     amd_machine_version_minor = 0
+; MOVREL-NEXT:     amd_machine_version_stepping = 3
+; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
+; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
+; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT:     priority = 0
+; MOVREL-NEXT:     float_mode = 240
+; MOVREL-NEXT:     priv = 0
+; MOVREL-NEXT:     enable_dx10_clamp = 1
+; MOVREL-NEXT:     debug_mode = 0
+; MOVREL-NEXT:     enable_ieee_mode = 1
+; MOVREL-NEXT:     enable_wgp_mode = 0
+; MOVREL-NEXT:     enable_mem_ordered = 0
+; MOVREL-NEXT:     enable_fwd_progress = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT:     user_sgpr_count = 6
+; MOVREL-NEXT:     enable_trap_handler = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_z = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_info = 0
+; MOVREL-NEXT:     enable_vgpr_workitem_id = 0
+; MOVREL-NEXT:     enable_exception_msb = 0
+; MOVREL-NEXT:     granulated_lds_size = 0
+; MOVREL-NEXT:     enable_exception = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_buffer = 1
+; MOVREL-NEXT:     enable_sgpr_dispatch_ptr = 0
+; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
+; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; MOVREL-NEXT:     enable_sgpr_dispatch_id = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; MOVREL-NEXT:     enable_wavefront_size32 = 0
+; MOVREL-NEXT:     enable_ordered_append_gds = 0
+; MOVREL-NEXT:     private_element_size = 1
+; MOVREL-NEXT:     is_ptr64 = 1
+; MOVREL-NEXT:     is_dynamic_callstack = 0
+; MOVREL-NEXT:     is_debug_enabled = 0
+; MOVREL-NEXT:     is_xnack_enabled = 0
+; MOVREL-NEXT:     workitem_private_segment_byte_size = 0
+; MOVREL-NEXT:     workgroup_group_segment_byte_size = 0
+; MOVREL-NEXT:     gds_segment_byte_size = 0
+; MOVREL-NEXT:     kernarg_segment_byte_size = 28
+; MOVREL-NEXT:     workgroup_fbarrier_count = 0
+; MOVREL-NEXT:     wavefront_sgpr_count = 6
+; MOVREL-NEXT:     workitem_vgpr_count = 3
+; MOVREL-NEXT:     reserved_vgpr_first = 0
+; MOVREL-NEXT:     reserved_vgpr_count = 0
+; MOVREL-NEXT:     reserved_sgpr_first = 0
+; MOVREL-NEXT:     reserved_sgpr_count = 0
+; MOVREL-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; MOVREL-NEXT:     debug_private_segment_buffer_sgpr = 0
+; MOVREL-NEXT:     kernarg_segment_alignment = 4
+; MOVREL-NEXT:     group_segment_alignment = 4
+; MOVREL-NEXT:     private_segment_alignment = 4
+; MOVREL-NEXT:     wavefront_size = 6
+; MOVREL-NEXT:     call_convention = -1
+; MOVREL-NEXT:     runtime_loader_kernel_symbol = 0
+; MOVREL-NEXT:    .end_amd_kernel_code_t
+; MOVREL-NEXT:  ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; MOVREL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    s_cmp_eq_u32 s2, 1
+; MOVREL-NEXT:    s_cselect_b32 s3, 2.0, 1.0
+; MOVREL-NEXT:    s_cmp_eq_u32 s2, 2
+; MOVREL-NEXT:    s_cselect_b32 s3, 0x40400000, s3
+; MOVREL-NEXT:    s_cmp_eq_u32 s2, 3
+; MOVREL-NEXT:    s_cselect_b32 s2, 4.0, s3
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s2
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
+; MOVREL-NEXT:    flat_store_dword v[0:1], v2
+; MOVREL-NEXT:    s_endpgm
+entry:
+  %ext = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %sel
+  store float %ext, float addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i32 %sel) {
+; GPRIDX-LABEL: dyn_extract_v4f64_s_s_s:
+; GPRIDX:         .amd_kernel_code_t
+; GPRIDX-NEXT:     amd_code_version_major = 1
+; GPRIDX-NEXT:     amd_code_version_minor = 2
+; GPRIDX-NEXT:     amd_machine_kind = 1
+; GPRIDX-NEXT:     amd_machine_version_major = 9
+; GPRIDX-NEXT:     amd_machine_version_minor = 0
+; GPRIDX-NEXT:     amd_machine_version_stepping = 0
+; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
+; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
+; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 0
+; GPRIDX-NEXT:     priority = 0
+; GPRIDX-NEXT:     float_mode = 240
+; GPRIDX-NEXT:     priv = 0
+; GPRIDX-NEXT:     enable_dx10_clamp = 1
+; GPRIDX-NEXT:     debug_mode = 0
+; GPRIDX-NEXT:     enable_ieee_mode = 1
+; GPRIDX-NEXT:     enable_wgp_mode = 0
+; GPRIDX-NEXT:     enable_mem_ordered = 0
+; GPRIDX-NEXT:     enable_fwd_progress = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT:     user_sgpr_count = 6
+; GPRIDX-NEXT:     enable_trap_handler = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_id_z = 0
+; GPRIDX-NEXT:     enable_sgpr_workgroup_info = 0
+; GPRIDX-NEXT:     enable_vgpr_workitem_id = 0
+; GPRIDX-NEXT:     enable_exception_msb = 0
+; GPRIDX-NEXT:     granulated_lds_size = 0
+; GPRIDX-NEXT:     enable_exception = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GPRIDX-NEXT:     enable_sgpr_dispatch_ptr = 0
+; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
+; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GPRIDX-NEXT:     enable_wavefront_size32 = 0
+; GPRIDX-NEXT:     enable_ordered_append_gds = 0
+; GPRIDX-NEXT:     private_element_size = 1
+; GPRIDX-NEXT:     is_ptr64 = 1
+; GPRIDX-NEXT:     is_dynamic_callstack = 0
+; GPRIDX-NEXT:     is_debug_enabled = 0
+; GPRIDX-NEXT:     is_xnack_enabled = 0
+; GPRIDX-NEXT:     workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT:     workgroup_group_segment_byte_size = 0
+; GPRIDX-NEXT:     gds_segment_byte_size = 0
+; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
+; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
+; GPRIDX-NEXT:     wavefront_sgpr_count = 7
+; GPRIDX-NEXT:     workitem_vgpr_count = 4
+; GPRIDX-NEXT:     reserved_vgpr_first = 0
+; GPRIDX-NEXT:     reserved_vgpr_count = 0
+; GPRIDX-NEXT:     reserved_sgpr_first = 0
+; GPRIDX-NEXT:     reserved_sgpr_count = 0
+; GPRIDX-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GPRIDX-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GPRIDX-NEXT:     kernarg_segment_alignment = 4
+; GPRIDX-NEXT:     group_segment_alignment = 4
+; GPRIDX-NEXT:     private_segment_alignment = 4
+; GPRIDX-NEXT:     wavefront_size = 6
+; GPRIDX-NEXT:     call_convention = -1
+; GPRIDX-NEXT:     runtime_loader_kernel_symbol = 0
+; GPRIDX-NEXT:    .end_amd_kernel_code_t
+; GPRIDX-NEXT:  ; %bb.0: ; %entry
+; GPRIDX-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GPRIDX-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GPRIDX-NEXT:    s_mov_b32 s0, 0
+; GPRIDX-NEXT:    s_mov_b32 s1, 0x40080000
+; GPRIDX-NEXT:    s_waitcnt lgkmcnt(0)
+; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT:    s_cmp_eq_u32 s6, 1
+; GPRIDX-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; GPRIDX-NEXT:    s_cmp_eq_u32 s6, 2
+; GPRIDX-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GPRIDX-NEXT:    s_cmp_eq_u32 s6, 3
+; GPRIDX-NEXT:    s_cselect_b64 s[0:1], 4.0, s[0:1]
+; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT:    v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GPRIDX-NEXT:    s_endpgm
+;
+; MOVREL-LABEL: dyn_extract_v4f64_s_s_s:
+; MOVREL:         .amd_kernel_code_t
+; MOVREL-NEXT:     amd_code_version_major = 1
+; MOVREL-NEXT:     amd_code_version_minor = 2
+; MOVREL-NEXT:     amd_machine_kind = 1
+; MOVREL-NEXT:     amd_machine_version_major = 8
+; MOVREL-NEXT:     amd_machine_version_minor = 0
+; MOVREL-NEXT:     amd_machine_version_stepping = 3
+; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
+; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
+; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT:     priority = 0
+; MOVREL-NEXT:     float_mode = 240
+; MOVREL-NEXT:     priv = 0
+; MOVREL-NEXT:     enable_dx10_clamp = 1
+; MOVREL-NEXT:     debug_mode = 0
+; MOVREL-NEXT:     enable_ieee_mode = 1
+; MOVREL-NEXT:     enable_wgp_mode = 0
+; MOVREL-NEXT:     enable_mem_ordered = 0
+; MOVREL-NEXT:     enable_fwd_progress = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT:     user_sgpr_count = 6
+; MOVREL-NEXT:     enable_trap_handler = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_id_z = 0
+; MOVREL-NEXT:     enable_sgpr_workgroup_info = 0
+; MOVREL-NEXT:     enable_vgpr_workitem_id = 0
+; MOVREL-NEXT:     enable_exception_msb = 0
+; MOVREL-NEXT:     granulated_lds_size = 0
+; MOVREL-NEXT:     enable_exception = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_buffer = 1
+; MOVREL-NEXT:     enable_sgpr_dispatch_ptr = 0
+; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
+; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; MOVREL-NEXT:     enable_sgpr_dispatch_id = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; MOVREL-NEXT:     enable_wavefront_size32 = 0
+; MOVREL-NEXT:     enable_ordered_append_gds = 0
+; MOVREL-NEXT:     private_element_size = 1
+; MOVREL-NEXT:     is_ptr64 = 1
+; MOVREL-NEXT:     is_dynamic_callstack = 0
+; MOVREL-NEXT:     is_debug_enabled = 0
+; MOVREL-NEXT:     is_xnack_enabled = 0
+; MOVREL-NEXT:     workitem_private_segment_byte_size = 0
+; MOVREL-NEXT:     workgroup_group_segment_byte_size = 0
+; MOVREL-NEXT:     gds_segment_byte_size = 0
+; MOVREL-NEXT:     kernarg_segment_byte_size = 28
+; MOVREL-NEXT:     workgroup_fbarrier_count = 0
+; MOVREL-NEXT:     wavefront_sgpr_count = 7
+; MOVREL-NEXT:     workitem_vgpr_count = 4
+; MOVREL-NEXT:     reserved_vgpr_first = 0
+; MOVREL-NEXT:     reserved_vgpr_count = 0
+; MOVREL-NEXT:     reserved_sgpr_first = 0
+; MOVREL-NEXT:     reserved_sgpr_count = 0
+; MOVREL-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; MOVREL-NEXT:     debug_private_segment_buffer_sgpr = 0
+; MOVREL-NEXT:     kernarg_segment_alignment = 4
+; MOVREL-NEXT:     group_segment_alignment = 4
+; MOVREL-NEXT:     private_segment_alignment = 4
+; MOVREL-NEXT:     wavefront_size = 6
+; MOVREL-NEXT:     call_convention = -1
+; MOVREL-NEXT:     runtime_loader_kernel_symbol = 0
+; MOVREL-NEXT:    .end_amd_kernel_code_t
+; MOVREL-NEXT:  ; %bb.0: ; %entry
+; MOVREL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; MOVREL-NEXT:    s_load_dword s6, s[4:5], 0x8
+; MOVREL-NEXT:    s_mov_b32 s0, 0
+; MOVREL-NEXT:    s_mov_b32 s1, 0x40080000
+; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
+; MOVREL-NEXT:    v_mov_b32_e32 v2, s2
+; MOVREL-NEXT:    s_cmp_eq_u32 s6, 1
+; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
+; MOVREL-NEXT:    s_cmp_eq_u32 s6, 2
+; MOVREL-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; MOVREL-NEXT:    s_cmp_eq_u32 s6, 3
+; MOVREL-NEXT:    s_cselect_b64 s[0:1], 4.0, s[0:1]
+; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
+; MOVREL-NEXT:    v_mov_b32_e32 v1, s1
+; MOVREL-NEXT:    v_mov_b32_e32 v3, s3
+; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; MOVREL-NEXT:    s_endpgm
+entry:
+  %ext = extractelement <4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, i32 %sel
+  store double %ext, double addrspace(1)* %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
index 9ce09e46aae20..af9524c8c1f46 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir
@@ -42,53 +42,109 @@ body: |
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 
     ; WAVE64-LABEL: name: extract_vector_elt_v16s32_sv
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-    ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[V_MOV_B32_e32_]](s32)
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE64: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C7]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE64: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE64: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C8]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE64: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE64: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C9]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE64: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE64: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C10]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE64: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE64: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C11]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE64: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE64: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C12]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE64: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE64: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C13]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE64: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C14]]
+    ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE64: $vgpr0 = COPY [[COPY2]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-    ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: $vgpr0 = COPY [[V_MOV_B32_e32_]](s32)
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE32: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C7]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE32: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE32: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C8]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE32: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE32: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C9]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE32: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE32: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C10]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE32: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE32: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C11]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE32: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE32: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C12]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE32: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE32: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C13]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE32: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C14]]
+    ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE32: $vgpr0 = COPY [[COPY2]](s32)
     %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     %1:_(s32) = COPY $vgpr0
     %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -132,51 +188,109 @@ body: |
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 
     ; WAVE64-LABEL: name: extract_vector_elt_v16s32_vv
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
-    ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE64: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C7]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE64: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE64: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C8]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE64: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE64: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C9]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE64: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE64: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C10]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE64: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE64: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C11]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE64: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE64: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C12]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE64: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE64: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C13]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE64: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C14]]
+    ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE64: $vgpr0 = COPY [[COPY2]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
-    ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %8, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE32: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C7]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE32: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE32: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C8]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE32: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE32: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C9]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE32: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE32: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C10]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE32: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE32: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C11]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE32: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE32: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C12]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE32: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE32: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C13]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE32: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C14]]
+    ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE32: $vgpr0 = COPY [[COPY2]](s32)
     %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     %1:_(s32) = COPY $vgpr16
     %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -259,78 +373,78 @@ body: |
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 
     ; WAVE64-LABEL: name: extract_vector_elt_v8s64_sv
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-    ; WAVE64: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1
-    ; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1
-    ; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
-    ; WAVE64: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
-    ; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE64: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[V_MOV_B32_e32_]](s32), [[V_MOV_B32_e32_1]](s32)
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
-    ; WAVE32: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1
-    ; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1
-    ; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
-    ; WAVE32: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
-    ; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE32: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[V_MOV_B32_e32_]](s32), [[V_MOV_B32_e32_1]](s32)
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
     %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     %1:_(s32) = COPY $vgpr0
@@ -348,74 +462,78 @@ body: |
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 
     ; WAVE64-LABEL: name: extract_vector_elt_v8s64_vv
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
-    ; WAVE64: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1
-    ; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1
-    ; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
-    ; WAVE64: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vv
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
-    ; WAVE32: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF4]], %bb.0, %20, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1
-    ; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1
-    ; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
-    ; WAVE32: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
-    ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C1]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C2]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C3]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C4]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C5]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY1]](s32), [[C6]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
     ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
     %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     %1:_(s32) = COPY $vgpr16
@@ -433,61 +551,115 @@ body: |
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 
     ; WAVE64-LABEL: name: extract_vector_elt_v16s32_vv_idx_add1
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
     ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE64: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE64: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE64: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE64: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE64: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE64: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE64: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE64: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE64: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE64: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE64: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE64: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE64: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE64: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add1
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
     ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
-    ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE32: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE32: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE32: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE32: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE32: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE32: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE32: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE32: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE32: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE32: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE32: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE32: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE32: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE32: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
     %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     %1:_(s32) = COPY $vgpr16
     %2:_(s32) = G_CONSTANT i32 1
@@ -506,57 +678,115 @@ body: |
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 
     ; WAVE64-LABEL: name: extract_vector_elt_v16s32_vv_idx_addm1
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
-    ; WAVE64: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE64: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE64: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE64: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE64: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE64: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE64: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE64: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE64: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE64: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE64: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE64: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE64: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE64: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE64: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_addm1
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
-    ; WAVE32: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
-    ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE32: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE32: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE32: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE32: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE32: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE32: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE32: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE32: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE32: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE32: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE32: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE32: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE32: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE32: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
     %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     %1:_(s32) = COPY $vgpr16
     %2:_(s32) = G_CONSTANT i32 -1
@@ -575,57 +805,115 @@ body: |
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 
     ; WAVE64-LABEL: name: extract_vector_elt_v16s32_vv_idx_add16
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
-    ; WAVE64: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE64: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE64: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE64: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE64: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE64: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE64: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE64: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE64: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE64: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE64: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE64: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE64: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE64: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE64: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_vv_idx_add16
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
-    ; WAVE32: [[ADD:%[0-9]+]]:vgpr_32(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[ADD]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[ADD]](s32), implicit $exec
-    ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
+    ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE32: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE32: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE32: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE32: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE32: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE32: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE32: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE32: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE32: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE32: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE32: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE32: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE32: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE32: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
     %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     %1:_(s32) = COPY $vgpr16
     %2:_(s32) = G_CONSTANT i32 16
@@ -644,84 +932,84 @@ body: |
     liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
 
     ; WAVE64-LABEL: name: extract_vector_elt_v8s64_vv_idx_add1
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE64: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE64: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %23, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %10(s32), %bb.1
-    ; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
-    ; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
-    ; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
-    ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
-    ; WAVE64: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE64: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
     ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ; WAVE32-LABEL: name: extract_vector_elt_v8s64_vv_idx_add1
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16
     ; WAVE32: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr16
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE32: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF4]], %bb.0, %23, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %10(s32), %bb.1
-    ; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
-    ; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
-    ; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
-    ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
-    ; WAVE32: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
-    ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE32: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
     ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
     %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     %1:_(s32) = COPY $vgpr16
@@ -741,63 +1029,115 @@ body: |
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 
     ; WAVE64-LABEL: name: extract_vector_elt_v16s32_sv_idx_add1
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE64: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
     ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
-    ; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[V_MOV_B32_e32_]](s32)
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE64: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE64: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE64: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE64: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE64: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE64: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE64: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE64: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE64: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE64: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE64: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE64: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE64: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE64: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE64: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE64: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE64: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE64: $vgpr0 = COPY [[COPY3]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv_idx_add1
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE32: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]], %bb.0, %11, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %4(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<16 x s32>)
     ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C1]]
-    ; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[ADD1]](s32)
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: $vgpr0 = COPY [[V_MOV_B32_e32_]](s32)
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV2]], [[SELECT]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV3]], [[SELECT1]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV4]], [[SELECT2]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV5]], [[SELECT3]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV6]], [[SELECT4]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV7]], [[SELECT5]]
+    ; WAVE32: [[C8:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 8
+    ; WAVE32: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C8]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV8]], [[SELECT6]]
+    ; WAVE32: [[C9:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 9
+    ; WAVE32: [[ICMP8:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP8]](s1), [[UV9]], [[SELECT7]]
+    ; WAVE32: [[C10:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 10
+    ; WAVE32: [[ICMP9:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C10]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP9]](s1), [[UV10]], [[SELECT8]]
+    ; WAVE32: [[C11:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 11
+    ; WAVE32: [[ICMP10:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C11]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP10]](s1), [[UV11]], [[SELECT9]]
+    ; WAVE32: [[C12:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12
+    ; WAVE32: [[ICMP11:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C12]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP11]](s1), [[UV12]], [[SELECT10]]
+    ; WAVE32: [[C13:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 13
+    ; WAVE32: [[ICMP12:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C13]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP12]](s1), [[UV13]], [[SELECT11]]
+    ; WAVE32: [[C14:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 14
+    ; WAVE32: [[ICMP13:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C14]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP13]](s1), [[UV14]], [[SELECT12]]
+    ; WAVE32: [[C15:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 15
+    ; WAVE32: [[ICMP14:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C15]]
+    ; WAVE32: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP14]](s1), [[UV15]], [[SELECT13]]
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT14]](s32)
+    ; WAVE32: $vgpr0 = COPY [[COPY3]](s32)
     %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     %1:_(s32) = COPY $vgpr0
     %2:_(s32) = G_CONSTANT i32 1
@@ -816,88 +1156,84 @@ body: |
     liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
 
     ; WAVE64-LABEL: name: extract_vector_elt_v8s64_sv_add1
-    ; WAVE64: successors: %bb.1(0x80000000)
     ; WAVE64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE64: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+    ; WAVE64: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE64: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE64: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE64: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE64: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE64: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
-    ; WAVE64: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
-    ; WAVE64: .1:
-    ; WAVE64: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE64: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %23, %bb.1
-    ; WAVE64: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %10(s32), %bb.1
-    ; WAVE64: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
-    ; WAVE64: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
-    ; WAVE64: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE64: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
-    ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
-    ; WAVE64: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
-    ; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE64: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
-    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
-    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
-    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE64: .2:
-    ; WAVE64: successors: %bb.3(0x80000000)
-    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
-    ; WAVE64: .3:
-    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[V_MOV_B32_e32_]](s32), [[V_MOV_B32_e32_1]](s32)
+    ; WAVE64: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE64: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE64: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE64: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE64: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE64: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE64: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE64: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE64: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE64: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE64: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE64: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE64: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE64: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE64: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE64: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE64: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE64: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE64: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE64: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE64: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE64: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE64: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE64: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE64: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE64: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE64: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE64: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
     ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv_add1
-    ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
     ; WAVE32: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+    ; WAVE32: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
     ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
     ; WAVE32: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; WAVE32: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY1]], [[COPY2]]
-    ; WAVE32: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>)
+    ; WAVE32: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>)
     ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
-    ; WAVE32: [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; WAVE32: [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
-    ; WAVE32: .1:
-    ; WAVE32: successors: %bb.2(0x40000000), %bb.1(0x40000000)
-    ; WAVE32: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF4]], %bb.0, %23, %bb.1
-    ; WAVE32: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %10(s32), %bb.1
-    ; WAVE32: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %11(s32), %bb.1
-    ; WAVE32: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %6(s32), %bb.1
-    ; WAVE32: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %7(s32), %bb.1
-    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
-    ; WAVE32: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[V_READFIRSTLANE_B32_]], [[C2]]
-    ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[ADD1]], [[C1]](s32)
-    ; WAVE32: [[ADD2:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C1]]
-    ; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-    ; WAVE32: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD2]](s32)
-    ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
-    ; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
-    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
-    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
-    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
-    ; WAVE32: .2:
-    ; WAVE32: successors: %bb.3(0x80000000)
-    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
-    ; WAVE32: .3:
-    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[V_MOV_B32_e32_]](s32), [[V_MOV_B32_e32_1]](s32)
+    ; WAVE32: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C1]]
+    ; WAVE32: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV2]], [[UV]]
+    ; WAVE32: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV3]], [[UV1]]
+    ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+    ; WAVE32: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C2]]
+    ; WAVE32: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV4]], [[SELECT]]
+    ; WAVE32: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV5]], [[SELECT1]]
+    ; WAVE32: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+    ; WAVE32: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C3]]
+    ; WAVE32: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV6]], [[SELECT2]]
+    ; WAVE32: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV7]], [[SELECT3]]
+    ; WAVE32: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4
+    ; WAVE32: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C4]]
+    ; WAVE32: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV8]], [[SELECT4]]
+    ; WAVE32: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV9]], [[SELECT5]]
+    ; WAVE32: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
+    ; WAVE32: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+    ; WAVE32: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV10]], [[SELECT6]]
+    ; WAVE32: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV11]], [[SELECT7]]
+    ; WAVE32: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
+    ; WAVE32: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C6]]
+    ; WAVE32: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV12]], [[SELECT8]]
+    ; WAVE32: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV13]], [[SELECT9]]
+    ; WAVE32: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7
+    ; WAVE32: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C7]]
+    ; WAVE32: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV14]], [[SELECT10]]
+    ; WAVE32: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV15]], [[SELECT11]]
+    ; WAVE32: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[SELECT12]](s32)
+    ; WAVE32: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[SELECT13]](s32)
+    ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
     ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
     %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
     %1:_(s32) = COPY $vgpr0

From 74bd98829d82312676a60c5c2d142e20691b2f13 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 5 Jun 2020 13:32:33 -0700
Subject: [PATCH 14/24] Migrate Binary::checkOffset from error_code to Error,
 NFC

In my use case, this saved 100ms of one-time initialization time for
std::error_code().
---
 llvm/include/llvm/Object/Binary.h        |  8 ++++----
 llvm/include/llvm/Object/ELFObjectFile.h |  4 ++--
 llvm/lib/Object/COFFObjectFile.cpp       | 18 ++++++++++--------
 llvm/lib/Object/XCOFFObjectFile.cpp      |  4 ++--
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Object/Binary.h b/llvm/include/llvm/Object/Binary.h
index aa5e718f5e9bb..e95516f30a403 100644
--- a/llvm/include/llvm/Object/Binary.h
+++ b/llvm/include/llvm/Object/Binary.h
@@ -160,14 +160,14 @@ class Binary {
     return Triple::UnknownObjectFormat;
   }
 
-  static std::error_code checkOffset(MemoryBufferRef M, uintptr_t Addr,
-                                     const uint64_t Size) {
+  static Error checkOffset(MemoryBufferRef M, uintptr_t Addr,
+                           const uint64_t Size) {
     if (Addr + Size < Addr || Addr + Size < Size ||
         Addr + Size > uintptr_t(M.getBufferEnd()) ||
         Addr < uintptr_t(M.getBufferStart())) {
-      return object_error::unexpected_eof;
+      return errorCodeToError(object_error::unexpected_eof);
     }
-    return std::error_code();
+    return Error::success();
   }
 };
 
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index d7fdc5294a0ab..62ecd8b5a7e5c 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -744,10 +744,10 @@ ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec) const {
   const Elf_Shdr *EShdr = getSection(Sec);
   if (EShdr->sh_type == ELF::SHT_NOBITS)
     return makeArrayRef((const uint8_t *)base(), 0);
-  if (std::error_code EC =
+  if (Error E =
           checkOffset(getMemoryBufferRef(),
                       (uintptr_t)base() + EShdr->sh_offset, EShdr->sh_size))
-    return errorCodeToError(EC);
+    return std::move(E);
   return makeArrayRef((const uint8_t *)base() + EShdr->sh_offset,
                       EShdr->sh_size);
 }
diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp
index 78bcfb177ee5d..3d129592738c3 100644
--- a/llvm/lib/Object/COFFObjectFile.cpp
+++ b/llvm/lib/Object/COFFObjectFile.cpp
@@ -59,8 +59,8 @@ static std::error_code getObject(const T *&Obj, MemoryBufferRef M,
                                  const void *Ptr,
                                  const uint64_t Size = sizeof(T)) {
   uintptr_t Addr = uintptr_t(Ptr);
-  if (std::error_code EC = Binary::checkOffset(M, Addr, Size))
-    return EC;
+  if (Error E = Binary::checkOffset(M, Addr, Size))
+    return errorToErrorCode(std::move(E));
   Obj = reinterpret_cast<const T *>(Addr);
   return std::error_code();
 }
@@ -374,9 +374,11 @@ getFirstReloc(const coff_section *Sec, MemoryBufferRef M, const uint8_t *Base) {
     // relocations.
     begin++;
   }
-  if (Binary::checkOffset(M, uintptr_t(begin),
-                          sizeof(coff_relocation) * NumRelocs))
+  if (auto E = Binary::checkOffset(M, uintptr_t(begin),
+                                   sizeof(coff_relocation) * NumRelocs)) {
+    consumeError(std::move(E));
     return nullptr;
+  }
   return begin;
 }
 
@@ -555,8 +557,8 @@ std::error_code COFFObjectFile::initImportTablePtr() {
   uintptr_t IntPtr = 0;
   if (std::error_code EC = getRvaPtr(ImportTableRva, IntPtr))
     return EC;
-  if (std::error_code EC = checkOffset(Data, IntPtr, DataEntry->Size))
-    return EC;
+  if (Error E = checkOffset(Data, IntPtr, DataEntry->Size))
+    return errorToErrorCode(std::move(E));
   ImportDirectory = reinterpret_cast<
       const coff_import_directory_table_entry *>(IntPtr);
   return std::error_code();
@@ -1093,8 +1095,8 @@ Error COFFObjectFile::getSectionContents(const coff_section *Sec,
   // data, as there's nothing that says that is not allowed.
   uintptr_t ConStart = uintptr_t(base()) + Sec->PointerToRawData;
   uint32_t SectionSize = getSectionSize(Sec);
-  if (checkOffset(Data, ConStart, SectionSize))
-    return make_error<BinaryError>();
+  if (Error E = checkOffset(Data, ConStart, SectionSize))
+    return E;
   Res = makeArrayRef(reinterpret_cast<const uint8_t *>(ConStart), SectionSize);
   return Error::success();
 }
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index d41afc8bdc24d..f75291d22eece 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -29,8 +29,8 @@ template <typename T>
 static Expected<const T *> getObject(MemoryBufferRef M, const void *Ptr,
                                      const uint64_t Size = sizeof(T)) {
   uintptr_t Addr = uintptr_t(Ptr);
-  if (std::error_code EC = Binary::checkOffset(M, Addr, Size))
-    return errorCodeToError(EC);
+  if (Error E = Binary::checkOffset(M, Addr, Size))
+    return std::move(E);
   return reinterpret_cast<const T *>(Addr);
 }
 

From 3408dcbdf054ac3cc32a97a6a82a3cf5844be609 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Fri, 5 Jun 2020 11:46:07 -0700
Subject: [PATCH 15/24] [X86] Fold undef elts to 0 in
 getTargetVShiftByConstNode.

Similar to D81212.

Differential Revision: https://reviews.llvm.org/D81292
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  9 +++++---
 llvm/test/CodeGen/X86/vec_shift5.ll     | 28 ++++++++++++-------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 306b90671d517..467c21e0f5408 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23857,7 +23857,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
       for (unsigned i = 0; i != NumElts; ++i) {
         SDValue CurrentOp = SrcOp->getOperand(i);
         if (CurrentOp->isUndef()) {
-          Elts.push_back(CurrentOp);
+          // Must produce 0s in the correct bits.
+          Elts.push_back(DAG.getConstant(0, dl, ElementType));
           continue;
         }
         auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -23869,7 +23870,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
       for (unsigned i = 0; i != NumElts; ++i) {
         SDValue CurrentOp = SrcOp->getOperand(i);
         if (CurrentOp->isUndef()) {
-          Elts.push_back(CurrentOp);
+          // Must produce 0s in the correct bits.
+          Elts.push_back(DAG.getConstant(0, dl, ElementType));
           continue;
         }
         auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -23881,7 +23883,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
       for (unsigned i = 0; i != NumElts; ++i) {
         SDValue CurrentOp = SrcOp->getOperand(i);
         if (CurrentOp->isUndef()) {
-          Elts.push_back(CurrentOp);
+          // All shifted in bits must be the same so use 0.
+          Elts.push_back(DAG.getConstant(0, dl, ElementType));
           continue;
         }
         auto *ND = cast<ConstantSDNode>(CurrentOp);
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index 5c84d7c748f07..eef51e3ed3657 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -121,12 +121,12 @@ define <2 x i64> @test8() {
 define <8 x i16> @test9() {
 ; X32-LABEL: test9:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test9:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
 ; X64-NEXT:    retq
   %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
   ret <8 x i16> %1
@@ -135,12 +135,12 @@ define <8 x i16> @test9() {
 define <4 x i32> @test10() {
 ; X32-LABEL: test10:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test10:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
 ; X64-NEXT:    retq
   %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
   ret <4 x i32> %1
@@ -154,7 +154,7 @@ define <2 x i64> @test11() {
 ;
 ; X64-LABEL: test11:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <u,3>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
 ; X64-NEXT:    retq
   %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
   ret <2 x i64> %1
@@ -163,12 +163,12 @@ define <2 x i64> @test11() {
 define <8 x i16> @test12() {
 ; X32-LABEL: test12:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test12:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
 ; X64-NEXT:    retq
   %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
   ret <8 x i16> %1
@@ -177,12 +177,12 @@ define <8 x i16> @test12() {
 define <4 x i32> @test13() {
 ; X32-LABEL: test13:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test13:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
 ; X64-NEXT:    retq
   %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
   ret <4 x i32> %1
@@ -191,12 +191,12 @@ define <4 x i32> @test13() {
 define <8 x i16> @test14() {
 ; X32-LABEL: test14:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test14:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
 ; X64-NEXT:    retq
   %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
   ret <8 x i16> %1
@@ -205,12 +205,12 @@ define <8 x i16> @test14() {
 define <4 x i32> @test15() {
 ; X32-LABEL: test15:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = <u,64,u,256>
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [0,64,0,256]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test15:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <u,64,u,256>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,64,0,256]
 ; X64-NEXT:    retq
   %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
   ret <4 x i32> %1
@@ -224,7 +224,7 @@ define <2 x i64> @test16() {
 ;
 ; X64-LABEL: test16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = <u,248>
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,248,0,0,0,0,0,0,0]
 ; X64-NEXT:    retq
   %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
   ret <2 x i64> %1

From f28177dbe8d2e2955f7ca0a0ffdb1a44fefe092d Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Fri, 5 Jun 2020 11:38:59 -0700
Subject: [PATCH 16/24] Factor out GetEnvDeveloperDir() (NFC)

Differential Revision: https://reviews.llvm.org/D81289
---
 .../Host/macosx/objcxx/HostInfoMacOSX.mm      | 34 ++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index cf2f2dcb3aff8..37bcff24ba232 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -297,6 +297,19 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
   }
 }
 
+/// Return and cache $DEVELOPER_DIR if it is set and exists.
+static std::string GetEnvDeveloperDir() {
+  static std::string g_env_developer_dir;
+  static std::once_flag g_once_flag;
+  std::call_once(g_once_flag, [&]() {
+    if (const char *developer_dir_env_var = getenv("DEVELOPER_DIR")) {
+      FileSpec fspec(developer_dir_env_var);
+      if (FileSystem::Instance().Exists(fspec))
+        g_env_developer_dir = fspec.GetPath();
+    }});
+  return g_env_developer_dir;
+}
+
 FileSpec HostInfoMacOSX::GetXcodeContentsDirectory() {
   static FileSpec g_xcode_contents_path;
   static std::once_flag g_once_flag;
@@ -313,16 +326,14 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
       }
     }
 
-    if (const char *developer_dir_env_var = getenv("DEVELOPER_DIR")) {
-      FileSpec fspec(developer_dir_env_var);
-      if (FileSystem::Instance().Exists(fspec)) {
-        // FIXME: This looks like it couldn't possibly work!
-        std::string xcode_contents_dir =
-            XcodeSDK::FindXcodeContentsDirectoryInPath(fspec.GetPath());
-        if (!xcode_contents_dir.empty()) {
-          g_xcode_contents_path = FileSpec(xcode_contents_dir);
-          return;
-        }
+    std::string env_developer_dir = GetEnvDeveloperDir();
+    if (!env_developer_dir.empty()) {
+      // FIXME: This looks like it couldn't possibly work!
+      std::string xcode_contents_dir =
+          XcodeSDK::FindXcodeContentsDirectoryInPath(env_developer_dir);
+      if (!xcode_contents_dir.empty()) {
+        g_xcode_contents_path = FileSpec(xcode_contents_dir);
+        return;
       }
     }
 
@@ -359,8 +370,7 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
   std::string sdk_name = XcodeSDK::GetCanonicalName(info);
   auto find_sdk = [](std::string sdk_name) -> std::string {
     std::string xcrun_cmd;
-    Environment env = Host::GetEnvironment();
-    std::string developer_dir = env.lookup("DEVELOPER_DIR");
+    std::string developer_dir = GetEnvDeveloperDir();
     if (developer_dir.empty())
       if (FileSpec fspec = HostInfo::GetShlibDir())
         if (FileSystem::Instance().Exists(fspec)) {

From ad4e7b9dc82b13d124071f0add09cb541b495a0e Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Fri, 5 Jun 2020 11:58:05 -0700
Subject: [PATCH 17/24] Fix an oversight in GetXcodeContentsDirectory()

Since FindXcodeContentsDirectoryInPath expects the *.app/Contents and
DEVELOPER_DIR is supposed to point to Xcode.app, we need to append the
Contents path first.

Differential Revision: https://reviews.llvm.org/D81290
---
 lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
index 37bcff24ba232..fd88d0c31de6a 100644
--- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
+++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm
@@ -326,9 +326,9 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) {
       }
     }
 
-    std::string env_developer_dir = GetEnvDeveloperDir();
+    llvm::SmallString<128> env_developer_dir(GetEnvDeveloperDir());
     if (!env_developer_dir.empty()) {
-      // FIXME: This looks like it couldn't possibly work!
+      llvm::sys::path::append(env_developer_dir, "Contents");
       std::string xcode_contents_dir =
           XcodeSDK::FindXcodeContentsDirectoryInPath(env_developer_dir);
       if (!xcode_contents_dir.empty()) {

From 38f3ba591e3a64fa5bbe684b3171c7bda6c5b527 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 5 Jun 2020 17:00:20 -0400
Subject: [PATCH 18/24] Revert "Migrate Binary::checkOffset from error_code to
 Error, NFC"

This reverts commit 74bd98829d82312676a60c5c2d142e20691b2f13.
Breaks LLVM::section-headers.test everywhere, see e.g.
http://lab.llvm.org:8011/builders/clang-x86_64-debian-fast/builds/29940/steps/test-check-all/logs/FAIL%3A%20LLVM%3A%3Asection-headers.test
---
 llvm/include/llvm/Object/Binary.h        |  8 ++++----
 llvm/include/llvm/Object/ELFObjectFile.h |  4 ++--
 llvm/lib/Object/COFFObjectFile.cpp       | 18 ++++++++----------
 llvm/lib/Object/XCOFFObjectFile.cpp      |  4 ++--
 4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/Object/Binary.h b/llvm/include/llvm/Object/Binary.h
index e95516f30a403..aa5e718f5e9bb 100644
--- a/llvm/include/llvm/Object/Binary.h
+++ b/llvm/include/llvm/Object/Binary.h
@@ -160,14 +160,14 @@ class Binary {
     return Triple::UnknownObjectFormat;
   }
 
-  static Error checkOffset(MemoryBufferRef M, uintptr_t Addr,
-                           const uint64_t Size) {
+  static std::error_code checkOffset(MemoryBufferRef M, uintptr_t Addr,
+                                     const uint64_t Size) {
     if (Addr + Size < Addr || Addr + Size < Size ||
         Addr + Size > uintptr_t(M.getBufferEnd()) ||
         Addr < uintptr_t(M.getBufferStart())) {
-      return errorCodeToError(object_error::unexpected_eof);
+      return object_error::unexpected_eof;
     }
-    return Error::success();
+    return std::error_code();
   }
 };
 
diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h
index 62ecd8b5a7e5c..d7fdc5294a0ab 100644
--- a/llvm/include/llvm/Object/ELFObjectFile.h
+++ b/llvm/include/llvm/Object/ELFObjectFile.h
@@ -744,10 +744,10 @@ ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec) const {
   const Elf_Shdr *EShdr = getSection(Sec);
   if (EShdr->sh_type == ELF::SHT_NOBITS)
     return makeArrayRef((const uint8_t *)base(), 0);
-  if (Error E =
+  if (std::error_code EC =
           checkOffset(getMemoryBufferRef(),
                       (uintptr_t)base() + EShdr->sh_offset, EShdr->sh_size))
-    return std::move(E);
+    return errorCodeToError(EC);
   return makeArrayRef((const uint8_t *)base() + EShdr->sh_offset,
                       EShdr->sh_size);
 }
diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp
index 3d129592738c3..78bcfb177ee5d 100644
--- a/llvm/lib/Object/COFFObjectFile.cpp
+++ b/llvm/lib/Object/COFFObjectFile.cpp
@@ -59,8 +59,8 @@ static std::error_code getObject(const T *&Obj, MemoryBufferRef M,
                                  const void *Ptr,
                                  const uint64_t Size = sizeof(T)) {
   uintptr_t Addr = uintptr_t(Ptr);
-  if (Error E = Binary::checkOffset(M, Addr, Size))
-    return errorToErrorCode(std::move(E));
+  if (std::error_code EC = Binary::checkOffset(M, Addr, Size))
+    return EC;
   Obj = reinterpret_cast<const T *>(Addr);
   return std::error_code();
 }
@@ -374,11 +374,9 @@ getFirstReloc(const coff_section *Sec, MemoryBufferRef M, const uint8_t *Base) {
     // relocations.
     begin++;
   }
-  if (auto E = Binary::checkOffset(M, uintptr_t(begin),
-                                   sizeof(coff_relocation) * NumRelocs)) {
-    consumeError(std::move(E));
+  if (Binary::checkOffset(M, uintptr_t(begin),
+                          sizeof(coff_relocation) * NumRelocs))
     return nullptr;
-  }
   return begin;
 }
 
@@ -557,8 +555,8 @@ std::error_code COFFObjectFile::initImportTablePtr() {
   uintptr_t IntPtr = 0;
   if (std::error_code EC = getRvaPtr(ImportTableRva, IntPtr))
     return EC;
-  if (Error E = checkOffset(Data, IntPtr, DataEntry->Size))
-    return errorToErrorCode(std::move(E));
+  if (std::error_code EC = checkOffset(Data, IntPtr, DataEntry->Size))
+    return EC;
   ImportDirectory = reinterpret_cast<
       const coff_import_directory_table_entry *>(IntPtr);
   return std::error_code();
@@ -1095,8 +1093,8 @@ Error COFFObjectFile::getSectionContents(const coff_section *Sec,
   // data, as there's nothing that says that is not allowed.
   uintptr_t ConStart = uintptr_t(base()) + Sec->PointerToRawData;
   uint32_t SectionSize = getSectionSize(Sec);
-  if (Error E = checkOffset(Data, ConStart, SectionSize))
-    return E;
+  if (checkOffset(Data, ConStart, SectionSize))
+    return make_error<BinaryError>();
   Res = makeArrayRef(reinterpret_cast<const uint8_t *>(ConStart), SectionSize);
   return Error::success();
 }
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index f75291d22eece..d41afc8bdc24d 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -29,8 +29,8 @@ template <typename T>
 static Expected<const T *> getObject(MemoryBufferRef M, const void *Ptr,
                                      const uint64_t Size = sizeof(T)) {
   uintptr_t Addr = uintptr_t(Ptr);
-  if (Error E = Binary::checkOffset(M, Addr, Size))
-    return std::move(E);
+  if (std::error_code EC = Binary::checkOffset(M, Addr, Size))
+    return errorCodeToError(EC);
   return reinterpret_cast<const T *>(Addr);
 }
 

From 2dd83a923046a5cd9585dbf9f90daeab6c37265c Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 5 Jun 2020 22:56:24 +0200
Subject: [PATCH 19/24] [ASan][Test] Fix globals test for Mach-O

Summary: Use a portable section name, as for the test's purpose any name will do.

Reviewers: nickdesaulniers, thakis

Reviewed By: thakis

Subscribers: cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D81306
---
 clang/test/CodeGen/asan-globals.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGen/asan-globals.cpp b/clang/test/CodeGen/asan-globals.cpp
index 2feb305ebecd1..f02bd173d31b4 100644
--- a/clang/test/CodeGen/asan-globals.cpp
+++ b/clang/test/CodeGen/asan-globals.cpp
@@ -11,7 +11,7 @@ int global;
 int dyn_init_global = global;
 int __attribute__((no_sanitize("address"))) attributed_global;
 int blacklisted_global;
-int __attribute__((section(".foo.bar"))) sectioned_global;
+int __attribute__ ((section("__DATA, __common"))) sectioned_global;
 
 void func() {
   static int static_var = 0;
@@ -41,7 +41,7 @@ void func() {
 // CHECK: ![[ATTR_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true}
 // CHECK: ![[BLACKLISTED_GLOBAL]] = !{{{.*}}, null, null, i1 false, i1 true}
 // CHECK: ![[SECTIONED_GLOBAL]] = !{{{.*}} ![[SECTIONED_GLOBAL_LOC:[0-9]+]], !"sectioned_global", i1 false, i1 false}
-// CHECK: ![[SECTIONED_GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 14, i32 42}
+// CHECK: ![[SECTIONED_GLOBAL_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 14, i32 51}
 // CHECK: ![[STATIC_VAR]] = !{{{.*}} ![[STATIC_LOC:[0-9]+]], !"static_var", i1 false, i1 false}
 // CHECK: ![[STATIC_LOC]] = !{!"{{.*}}asan-globals.cpp", i32 17, i32 14}
 // CHECK: ![[LITERAL]] = !{{{.*}} ![[LITERAL_LOC:[0-9]+]], !"<string literal>", i1 false, i1 false}

From bff94a8e2bb93267a561ca96287f570af499b090 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Fri, 5 Jun 2020 23:11:04 +0200
Subject: [PATCH 20/24] [LoopIdiomRecognize] Remove unnecessary MaybeAlign use
 (NFC)

Loads and stores always have an alignment now.
---
 llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5b93aad11e143..a20f369b69f8a 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1109,11 +1109,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   else {
     // We cannot allow unaligned ops for unordered load/store, so reject
     // anything where the alignment isn't at least the element size.
-    const MaybeAlign StoreAlign = SI->getAlign();
-    const MaybeAlign LoadAlign = LI->getAlign();
-    if (StoreAlign == None || LoadAlign == None)
-      return false;
-    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
+    const Align StoreAlign = SI->getAlign();
+    const Align LoadAlign = LI->getAlign();
+    if (StoreAlign < StoreSize || LoadAlign < StoreSize)
       return false;
 
     // If the element.atomic memcpy is not lowered into explicit
@@ -1127,7 +1125,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
     // Note that unordered atomic loads/stores are *required* by the spec to
     // have an alignment but non-atomic loads/stores may not.
     NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
-        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes,
+        StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
         StoreSize);
   }
   NewCall->setDebugLoc(SI->getDebugLoc());

From cb5724c71e396729b7c0dd6a2a8aff20444dee09 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Fri, 5 Jun 2020 23:18:26 +0200
Subject: [PATCH 21/24] [CGP] Remove unnecessary MaybeAlign use (NFC)

Stores now always have an alignment.
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 345b3d77d53da..4ed49757733ca 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7083,13 +7083,13 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
     Value *Addr = Builder.CreateBitCast(
         SI.getOperand(1),
         SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
+    Align Alignment = SI.getAlign();
     const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
-    if (IsOffsetStore)
+    if (IsOffsetStore) {
       Addr = Builder.CreateGEP(
           SplitStoreType, Addr,
           ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
-    MaybeAlign Alignment = SI.getAlign();
-    if (IsOffsetStore && Alignment) {
+
       // When splitting the store in half, naturally one half will retain the
       // alignment of the original wider store, regardless of whether it was
       // over-aligned or not, while the other will require adjustment.

From b6c88549bc8fbc55e0042d565012e58292625778 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Fri, 5 Jun 2020 17:23:15 -0400
Subject: [PATCH 22/24] [mlir] Fix spurious f64 -> f16 change in CPU runner
 test

---
 .../mlir-cpu-runner/sgemm_naive_codegen.mlir     | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir b/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir
index ef2fe6cdaa719..92149c722166f 100644
--- a/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir
+++ b/mlir/test/mlir-cpu-runner/sgemm_naive_codegen.mlir
@@ -12,13 +12,13 @@ func @main() {
 
   %reps = constant 1 : index
 
-  %t_start = call @rtclock() : () -> f16
+  %t_start = call @rtclock() : () -> f64
   affine.for %arg0 = 0 to 5 {
     linalg.fill(%C, %cf1) : memref<16x16xf32>, f32
     call @sgemm_naive(%A, %B, %C) : (memref<16x16xf32>, memref<16x16xf32>, memref<16x16xf32>) -> ()
   }
-  %t_end = call @rtclock() : () -> f16
-  %t = subf %t_end, %t_start : f16
+  %t_end = call @rtclock() : () -> f64
+  %t = subf %t_end, %t_start : f64
 
   %pC = memref_cast %C : memref<16x16xf32> to memref<*xf32>
   call @print_memref_f32(%pC) : (memref<*xf32>) -> ()
@@ -35,9 +35,9 @@ func @main() {
   %f3 = muli %c2, %f2 : index
   %num_flops = muli %reps, %f3 : index
   %num_flops_i = index_cast %num_flops : index to i16
-  %num_flops_f = sitofp %num_flops_i : i16 to f16
-  %flops = divf %num_flops_f, %t : f16
-  call @print_flops(%flops) : (f16) -> ()
+  %num_flops_f = sitofp %num_flops_i : i16 to f64
+  %flops = divf %num_flops_f, %t : f64
+  call @print_flops(%flops) : (f64) -> ()
 
   return
 }
@@ -66,6 +66,6 @@ func @sgemm_naive(%arg0: memref<16x16xf32>, %arg1: memref<16x16xf32>, %arg2: mem
   return
 }
 
-func @print_flops(f16)
-func @rtclock() -> f16
+func @print_flops(f64)
+func @rtclock() -> f64
 func @print_memref_f32(memref<*xf32>)

From 8b05b6d53342b4532b046f85cbbe3278ed4d59cb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Fri, 5 Jun 2020 14:27:04 -0700
Subject: [PATCH 23/24] [X86] Add test cases for PR46203. NFC

---
 llvm/test/CodeGen/X86/dagcombine-select.ll | 51 ++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
index 506234b25f27c..a6a81b975705b 100644
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -430,3 +430,54 @@ define i32 @cttz_32_ne_select_ffs(i32 %v) nounwind {
   %add = select i1 %tobool, i32 %.op, i32 0
   ret i32 %add
 }
+
+; This matches the pattern emitted for __builtin_ffs - 1
+define i32 @cttz_32_eq_select_ffs_m1(i32 %v) nounwind {
+; NOBMI-LABEL: cttz_32_eq_select_ffs_m1:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    bsfl %edi, %ecx
+; NOBMI-NEXT:    xorl %eax, %eax
+; NOBMI-NEXT:    cmpl $1, %edi
+; NOBMI-NEXT:    sbbl %eax, %eax
+; NOBMI-NEXT:    orl %ecx, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: cttz_32_eq_select_ffs_m1:
+; BMI:       # %bb.0:
+; BMI-NEXT:    tzcntl %edi, %ecx
+; BMI-NEXT:    xorl %eax, %eax
+; BMI-NEXT:    cmpl $1, %edi
+; BMI-NEXT:    sbbl %eax, %eax
+; BMI-NEXT:    orl %ecx, %eax
+; BMI-NEXT:    retq
+
+  %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+  %tobool = icmp eq i32 %v, 0
+  %sel = select i1 %tobool, i32 -1, i32 %cnt
+  ret i32 %sel
+}
+
+define i32 @cttz_32_ne_select_ffs_m1(i32 %v) nounwind {
+; NOBMI-LABEL: cttz_32_ne_select_ffs_m1:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    bsfl %edi, %ecx
+; NOBMI-NEXT:    xorl %eax, %eax
+; NOBMI-NEXT:    cmpl $1, %edi
+; NOBMI-NEXT:    sbbl %eax, %eax
+; NOBMI-NEXT:    orl %ecx, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: cttz_32_ne_select_ffs_m1:
+; BMI:       # %bb.0:
+; BMI-NEXT:    tzcntl %edi, %ecx
+; BMI-NEXT:    xorl %eax, %eax
+; BMI-NEXT:    cmpl $1, %edi
+; BMI-NEXT:    sbbl %eax, %eax
+; BMI-NEXT:    orl %ecx, %eax
+; BMI-NEXT:    retq
+
+  %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+  %tobool = icmp ne i32 %v, 0
+  %sel = select i1 %tobool, i32 %cnt, i32 -1
+  ret i32 %sel
+}

From 0d4e243456809eabd6914669753eda242b5da4cb Mon Sep 17 00:00:00 2001
From: Dan Gohman <sunfish@mozilla.com>
Date: Fri, 5 Jun 2020 14:27:52 -0700
Subject: [PATCH 24/24] [WebAssembly] Improve clang diagnostics for wasm
 attributes

This patch addresses the review comments on r352930:

 - Removes redundant diagnostic checking code
 - Removes erroneous use of diag::err_alias_is_definition, which
   turned out to be ineffective anyway since functions can be defined later
   in the translation unit and avoid detection.
 - Adds a test for various invalid cases for import_name and import_module.

This reapplies D59520, with the addition of
`InGroup<IgnoredAttributes>` to the new warnings, to fix the
Misc/warning-flags.c failure.

Differential Revision: https://reviews.llvm.org/D59520
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  8 +++
 clang/include/clang/Sema/Sema.h               |  4 ++
 clang/lib/Sema/SemaDecl.cpp                   |  4 ++
 clang/lib/Sema/SemaDeclAttr.cpp               | 68 +++++++++++++------
 clang/test/AST/ast-dump-wasm-attr-export.c    | 33 +++++++++
 clang/test/AST/ast-dump-wasm-attr-import.c    | 36 ++++++++++
 clang/test/Sema/attr-wasm.c                   | 27 ++++++++
 7 files changed, 161 insertions(+), 19 deletions(-)
 create mode 100644 clang/test/AST/ast-dump-wasm-attr-export.c
 create mode 100644 clang/test/AST/ast-dump-wasm-attr-import.c
 create mode 100644 clang/test/Sema/attr-wasm.c

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0a284b9a89069..45a7f1c700b46 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10770,6 +10770,14 @@ def err_matrix_separate_incomplete_index: Error<
 def err_matrix_subscript_comma: Error<
   "comma expressions are not allowed as indices in matrix subscript expressions">;
 
+def warn_mismatched_import : Warning<
+  "import %select{module|name}0 (%1) does not match the import %select{module|name}0 (%2) of the "
+  "previous declaration">,
+  InGroup<IgnoredAttributes>;
+def warn_import_on_definition : Warning<
+  "import %select{module|name}0 cannot be applied to a function with a definition">,
+  InGroup<IgnoredAttributes>;
+
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
 def err_preserve_field_info_not_const: Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 5f0a03b1c93fa..cef25fc927aa1 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2999,6 +2999,10 @@ class Sema final {
                                                 const InternalLinkageAttr &AL);
   CommonAttr *mergeCommonAttr(Decl *D, const ParsedAttr &AL);
   CommonAttr *mergeCommonAttr(Decl *D, const CommonAttr &AL);
+  WebAssemblyImportNameAttr *mergeImportNameAttr(
+      Decl *D, const WebAssemblyImportNameAttr &AL);
+  WebAssemblyImportModuleAttr *mergeImportModuleAttr(
+      Decl *D, const WebAssemblyImportModuleAttr &AL);
 
   void mergeDeclAttributes(NamedDecl *New, Decl *Old,
                            AvailabilityMergeKind AMK = AMK_Redeclaration);
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6e52c95ad4889..025b09de0ad1c 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -2598,6 +2598,10 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
     NewAttr = S.mergeSpeculativeLoadHardeningAttr(D, *SLHA);
   else if (const auto *SLHA = dyn_cast<NoSpeculativeLoadHardeningAttr>(Attr))
     NewAttr = S.mergeNoSpeculativeLoadHardeningAttr(D, *SLHA);
+  else if (const auto *IMA = dyn_cast<WebAssemblyImportModuleAttr>(Attr))
+    NewAttr = S.mergeImportModuleAttr(D, *IMA);
+  else if (const auto *INA = dyn_cast<WebAssemblyImportNameAttr>(Attr))
+    NewAttr = S.mergeImportNameAttr(D, *INA);
   else if (Attr->shouldInheritEvenIfAlreadyPresent() || !DeclHasAttr(D, Attr))
     NewAttr = cast<InheritableAttr>(Attr->clone(S.Context));
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index df44b6fcf2af5..763db5b41bb87 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5892,45 +5892,75 @@ static void handleWebAssemblyExportNameAttr(Sema &S, Decl *D, const ParsedAttr &
   D->addAttr(UsedAttr::CreateImplicit(S.Context));
 }
 
-static void handleWebAssemblyImportModuleAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (!isFunctionOrMethod(D)) {
-    S.Diag(D->getLocation(), diag::warn_attribute_wrong_decl_type)
-        << "'import_module'" << ExpectedFunction;
-    return;
+WebAssemblyImportModuleAttr *
+Sema::mergeImportModuleAttr(Decl *D, const WebAssemblyImportModuleAttr &AL) {
+  auto *FD = cast<FunctionDecl>(D);
+
+  if (const auto *ExistingAttr = FD->getAttr<WebAssemblyImportModuleAttr>()) {
+    if (ExistingAttr->getImportModule() == AL.getImportModule())
+      return nullptr;
+    Diag(ExistingAttr->getLocation(), diag::warn_mismatched_import) << 0
+      << ExistingAttr->getImportModule() << AL.getImportModule();
+    Diag(AL.getLoc(), diag::note_previous_attribute);
+    return nullptr;
+  }
+  if (FD->hasBody()) {
+    Diag(AL.getLoc(), diag::warn_import_on_definition) << 0;
+    return nullptr;
   }
+  return ::new (Context) WebAssemblyImportModuleAttr(Context, AL,
+                                                     AL.getImportModule());
+}
 
+WebAssemblyImportNameAttr *
+Sema::mergeImportNameAttr(Decl *D, const WebAssemblyImportNameAttr &AL) {
   auto *FD = cast<FunctionDecl>(D);
-  if (FD->isThisDeclarationADefinition()) {
-    S.Diag(D->getLocation(), diag::err_alias_is_definition) << FD << 0;
-    return;
+
+  if (const auto *ExistingAttr = FD->getAttr<WebAssemblyImportNameAttr>()) {
+    if (ExistingAttr->getImportName() == AL.getImportName())
+      return nullptr;
+    Diag(ExistingAttr->getLocation(), diag::warn_mismatched_import) << 1
+      << ExistingAttr->getImportName() << AL.getImportName();
+    Diag(AL.getLoc(), diag::note_previous_attribute);
+    return nullptr;
+  }
+  if (FD->hasBody()) {
+    Diag(AL.getLoc(), diag::warn_import_on_definition) << 1;
+    return nullptr;
   }
+  return ::new (Context) WebAssemblyImportNameAttr(Context, AL,
+                                                   AL.getImportName());
+}
+
+static void
+handleWebAssemblyImportModuleAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  auto *FD = cast<FunctionDecl>(D);
 
   StringRef Str;
   SourceLocation ArgLoc;
   if (!S.checkStringLiteralArgumentAttr(AL, 0, Str, &ArgLoc))
     return;
+  if (FD->hasBody()) {
+    S.Diag(AL.getLoc(), diag::warn_import_on_definition) << 0;
+    return;
+  }
 
   FD->addAttr(::new (S.Context)
                   WebAssemblyImportModuleAttr(S.Context, AL, Str));
 }
 
-static void handleWebAssemblyImportNameAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (!isFunctionOrMethod(D)) {
-    S.Diag(D->getLocation(), diag::warn_attribute_wrong_decl_type)
-        << "'import_name'" << ExpectedFunction;
-    return;
-  }
-
+static void
+handleWebAssemblyImportNameAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   auto *FD = cast<FunctionDecl>(D);
-  if (FD->isThisDeclarationADefinition()) {
-    S.Diag(D->getLocation(), diag::err_alias_is_definition) << FD << 0;
-    return;
-  }
 
   StringRef Str;
   SourceLocation ArgLoc;
   if (!S.checkStringLiteralArgumentAttr(AL, 0, Str, &ArgLoc))
     return;
+  if (FD->hasBody()) {
+    S.Diag(AL.getLoc(), diag::warn_import_on_definition) << 1;
+    return;
+  }
 
   FD->addAttr(::new (S.Context) WebAssemblyImportNameAttr(S.Context, AL, Str));
 }
diff --git a/clang/test/AST/ast-dump-wasm-attr-export.c b/clang/test/AST/ast-dump-wasm-attr-export.c
new file mode 100644
index 0000000000000..951b6cf8b0832
--- /dev/null
+++ b/clang/test/AST/ast-dump-wasm-attr-export.c
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -ast-dump %s | FileCheck --strict-whitespace %s
+
+// Test that functions can be redeclared and they retain their attributes.
+
+__attribute__((export_name("export_red"))) void red(void) {}
+__attribute__((export_name("export_orange"))) void orange(void) {}
+__attribute__((export_name("export_yellow"))) void yellow(void) {}
+
+void red(void);
+void orange(void);
+void yellow(void);
+
+// CHECK: |-FunctionDecl {{.+}} used red 'void (void)'
+// CHECK: | |-CompoundStmt {{.+}}
+// CHECK: | |-WebAssemblyExportNameAttr {{.+}} "export_red"
+// CHECK: | `-UsedAttr {{.+}} Implicit
+// CHECK: |-FunctionDecl {{.+}} used orange 'void (void)'
+// CHECK: | |-CompoundStmt {{.+}}
+// CHECK: | |-WebAssemblyExportNameAttr {{.+}} "export_orange"
+// CHECK: | `-UsedAttr {{.+}} Implicit
+// CHECK: |-FunctionDecl {{.+}} used yellow 'void (void)'
+// CHECK: | |-CompoundStmt {{.+}}
+// CHECK: | |-WebAssemblyExportNameAttr {{.+}} "export_yellow"
+// CHECK: | `-UsedAttr {{.+}} Implicit
+// CHECK: |-FunctionDecl {{.+}} used red 'void (void)'
+// CHECK: | |-UsedAttr {{.+}} Inherited Implicit
+// CHECK: | `-WebAssemblyExportNameAttr {{.+}} Inherited "export_red"
+// CHECK: |-FunctionDecl {{.+}} used orange 'void (void)'
+// CHECK: | |-UsedAttr {{.+}} Inherited Implicit
+// CHECK: | `-WebAssemblyExportNameAttr {{.+}} Inherited "export_orange"
+// CHECK: `-FunctionDecl {{.+}} used yellow 'void (void)'
+// CHECK:   |-UsedAttr {{.+}} Inherited Implicit
+// CHECK:     `-WebAssemblyExportNameAttr {{.+}} Inherited "export_yellow"
diff --git a/clang/test/AST/ast-dump-wasm-attr-import.c b/clang/test/AST/ast-dump-wasm-attr-import.c
new file mode 100644
index 0000000000000..c4690eb15f271
--- /dev/null
+++ b/clang/test/AST/ast-dump-wasm-attr-import.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -ast-dump %s | FileCheck --strict-whitespace %s
+
+// Test that functions can be redeclared and they retain their attributes.
+
+__attribute__((import_name("import_red"), import_module("mod"))) void red(void);
+__attribute__((import_name("import_orange"), import_module("mod"))) void orange(void);
+__attribute__((import_name("import_yellow"), import_module("mod"))) void yellow(void);
+
+void red(void);
+void orange(void);
+void yellow(void);
+
+void calls(void) {
+    red();
+    orange();
+    yellow();
+}
+
+// CHECK: |-FunctionDecl {{.+}} used red 'void (void)'
+// CHECK: | |-WebAssemblyImportNameAttr {{.+}} "import_red"
+// CHECK: | `-WebAssemblyImportModuleAttr {{.+}} "mod"
+// CHECK: |-FunctionDecl {{.+}} used orange 'void (void)'
+// CHECK: | |-WebAssemblyImportNameAttr {{.+}} "import_orange"
+// CHECK: | `-WebAssemblyImportModuleAttr {{.+}} "mod"
+// CHECK: |-FunctionDecl {{.+}} used yellow 'void (void)'
+// CHECK: | |-WebAssemblyImportNameAttr {{.+}} "import_yellow"
+// CHECK: | `-WebAssemblyImportModuleAttr {{.+}} "mod"
+// CHECK: |-FunctionDecl {{.+}} used red 'void (void)'
+// CHECK: | |-WebAssemblyImportNameAttr {{.+}} Inherited "import_red"
+// CHECK: | `-WebAssemblyImportModuleAttr {{.+}} Inherited "mod"
+// CHECK: |-FunctionDecl {{.+}} used orange 'void (void)'
+// CHECK: | |-WebAssemblyImportNameAttr {{.+}} Inherited "import_orange"
+// CHECK: | `-WebAssemblyImportModuleAttr {{.+}} Inherited "mod"
+// CHECK: |-FunctionDecl {{.+}} used yellow 'void (void)'
+// CHECK: | |-WebAssemblyImportNameAttr {{.+}} Inherited "import_yellow"
+// CHECK: | `-WebAssemblyImportModuleAttr {{.+}} Inherited "mod"
diff --git a/clang/test/Sema/attr-wasm.c b/clang/test/Sema/attr-wasm.c
new file mode 100644
index 0000000000000..dc30ba46ea887
--- /dev/null
+++ b/clang/test/Sema/attr-wasm.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -fsyntax-only -verify %s
+
+void name_a() __attribute__((import_name)); //expected-error {{'import_name' attribute takes one argument}}
+
+int name_b __attribute__((import_name("foo"))); //expected-error {{'import_name' attribute only applies to functions}}
+
+void name_c() __attribute__((import_name("foo", "bar"))); //expected-error {{'import_name' attribute takes one argument}}
+
+void name_d() __attribute__((import_name("foo", "bar", "qux"))); //expected-error {{'import_name' attribute takes one argument}}
+
+void name_z() __attribute__((import_name("foo"))); //expected-note {{previous attribute is here}}
+
+void name_z() __attribute__((import_name("bar"))); //expected-warning {{import name (bar) does not match the import name (foo) of the previous declaration}}
+
+void module_a() __attribute__((import_module)); //expected-error {{'import_module' attribute takes one argument}}
+
+int module_b __attribute__((import_module("foo"))); //expected-error {{'import_module' attribute only applies to functions}}
+
+void module_c() __attribute__((import_module("foo", "bar"))); //expected-error {{'import_module' attribute takes one argument}}
+
+void module_d() __attribute__((import_module("foo", "bar", "qux"))); //expected-error {{'import_module' attribute takes one argument}}
+
+void module_z() __attribute__((import_module("foo"))); //expected-note {{previous attribute is here}}
+
+void module_z() __attribute__((import_module("bar"))); //expected-warning {{import module (bar) does not match the import module (foo) of the previous declaration}}
+
+void both() __attribute__((import_name("foo"), import_module("bar")));